/* $Id: IEMAllCImpl.cpp $ */
/** @file
 * IEM - Instruction Implementation in C/C++ (code include).
 */

/*
 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
 *
 * This file is part of VirtualBox base platform packages, as
 * available from https://www.virtualbox.org.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, in version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses>.
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */


/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#define LOG_GROUP   LOG_GROUP_IEM
#define VMCPU_INCL_CPUM_GST_CTX
#include <VBox/vmm/iem.h>
#include <VBox/vmm/cpum.h>
#include <VBox/vmm/apic.h>
#include <VBox/vmm/pdm.h>
#include <VBox/vmm/pgm.h>
#include <VBox/vmm/iom.h>
#include <VBox/vmm/em.h>
#include <VBox/vmm/hm.h>
#include <VBox/vmm/nem.h>
#include <VBox/vmm/gim.h>
#ifdef VBOX_WITH_NESTED_HWVIRT_SVM
# include <VBox/vmm/em.h>
# include <VBox/vmm/hm_svm.h>
#endif
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
# include <VBox/vmm/hmvmxinline.h>
#endif
#ifndef VBOX_WITHOUT_CPUID_HOST_CALL
# include <VBox/vmm/cpuidcall.h>
#endif
#include <VBox/vmm/tm.h>
#include <VBox/vmm/dbgf.h>
#include <VBox/vmm/dbgftrace.h>
#include "IEMInternal.h"
#include <VBox/vmm/vmcc.h>
#include <VBox/log.h>
#include <VBox/err.h>
#include <VBox/param.h>
#include <VBox/dis.h>
#include <VBox/disopcode.h>
#include <iprt/asm-math.h>
#include <iprt/assert.h>
#include <iprt/string.h>
#include <iprt/x86.h>

#include "IEMInline.h"


/*********************************************************************************************************************************
*   Defined Constants And Macros                                                                                                 *
*********************************************************************************************************************************/
/**
 * Flushes the prefetch buffer, light version.
 */
#ifndef IEM_WITH_CODE_TLB
# define IEM_FLUSH_PREFETCH_LIGHT(a_pVCpu, a_cbInstr) do { (a_pVCpu)->iem.s.cbOpcode   = (a_cbInstr); } while (0)
#else
# define IEM_FLUSH_PREFETCH_LIGHT(a_pVCpu, a_cbInstr) do { } while (0)
#endif

/**
 * Flushes the prefetch buffer, heavy version.
 */
#ifndef IEM_WITH_CODE_TLB
# define IEM_FLUSH_PREFETCH_HEAVY(a_pVCpu, a_cbInstr) do { (a_pVCpu)->iem.s.cbOpcode   = (a_cbInstr); } while (0)
#else
# if 1
#  define IEM_FLUSH_PREFETCH_HEAVY(a_pVCpu, a_cbInstr) do { (a_pVCpu)->iem.s.pbInstrBuf = NULL; } while (0)
# else
#  define IEM_FLUSH_PREFETCH_HEAVY(a_pVCpu, a_cbInstr) do { } while (0)
# endif
#endif


/** @name Misc Helpers
 * @{
 */


/**
 * Worker function for iemHlpCheckPortIOPermission, don't call directly.
 *
 * @returns Strict VBox status code.
 *
 * @param   pVCpu               The cross context virtual CPU structure of the calling thread.
 * @param   u16Port             The port number.
 * @param   cbOperand           The operand size.
 */
static VBOXSTRICTRC iemHlpCheckPortIOPermissionBitmap(PVMCPUCC pVCpu, uint16_t u16Port, uint8_t cbOperand)
{
    /* The TSS bits we're interested in are the same on 386 and AMD64. */
    AssertCompile(AMD64_SEL_TYPE_SYS_TSS_BUSY  == X86_SEL_TYPE_SYS_386_TSS_BUSY);
    AssertCompile(AMD64_SEL_TYPE_SYS_TSS_AVAIL == X86_SEL_TYPE_SYS_386_TSS_AVAIL);
    AssertCompileMembersAtSameOffset(X86TSS32, offIoBitmap, X86TSS64, offIoBitmap);
    AssertCompile(sizeof(X86TSS32) == sizeof(X86TSS64));

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_TR);

    /*
     * Check the TSS type, 16-bit TSSes doesn't have any I/O permission bitmap.
     */
    Assert(!pVCpu->cpum.GstCtx.tr.Attr.n.u1DescType);
    if (RT_UNLIKELY(   pVCpu->cpum.GstCtx.tr.Attr.n.u4Type != AMD64_SEL_TYPE_SYS_TSS_BUSY
                    && pVCpu->cpum.GstCtx.tr.Attr.n.u4Type != AMD64_SEL_TYPE_SYS_TSS_AVAIL))
    {
        Log(("iemHlpCheckPortIOPermissionBitmap: Port=%#x cb=%d - TSS type %#x (attr=%#x) has no I/O bitmap -> #GP(0)\n",
             u16Port, cbOperand, pVCpu->cpum.GstCtx.tr.Attr.n.u4Type, pVCpu->cpum.GstCtx.tr.Attr.u));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /*
     * Read the bitmap offset (may #PF).
     */
    uint16_t offBitmap;
    VBOXSTRICTRC rcStrict = iemMemFetchSysU16(pVCpu, &offBitmap, UINT8_MAX,
                                              pVCpu->cpum.GstCtx.tr.u64Base + RT_UOFFSETOF(X86TSS64, offIoBitmap));
    if (rcStrict != VINF_SUCCESS)
    {
        Log(("iemHlpCheckPortIOPermissionBitmap: Error reading offIoBitmap (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
        return rcStrict;
    }

    /*
     * The bit range from u16Port to (u16Port + cbOperand - 1), however intel
     * describes the CPU actually reading two bytes regardless of whether the
     * bit range crosses a byte boundrary.  Thus the + 1 in the test below.
     */
    uint32_t offFirstBit = (uint32_t)u16Port / 8 + offBitmap;
    /** @todo check if real CPUs ensures that offBitmap has a minimum value of
     *        for instance sizeof(X86TSS32). */
    if (offFirstBit + 1 > pVCpu->cpum.GstCtx.tr.u32Limit) /* the limit is inclusive */
    {
        Log(("iemHlpCheckPortIOPermissionBitmap: offFirstBit=%#x + 1 is beyond u32Limit=%#x -> #GP(0)\n",
             offFirstBit, pVCpu->cpum.GstCtx.tr.u32Limit));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /*
     * Read the necessary bits.
     */
    /** @todo Test the assertion in the intel manual that the CPU reads two
     *        bytes.  The question is how this works wrt to \#PF and \#GP on the
     *        2nd byte when it's not required. */
    uint16_t bmBytes = UINT16_MAX;
    rcStrict = iemMemFetchSysU16(pVCpu, &bmBytes, UINT8_MAX, pVCpu->cpum.GstCtx.tr.u64Base + offFirstBit);
    if (rcStrict != VINF_SUCCESS)
    {
        Log(("iemHlpCheckPortIOPermissionBitmap: Error reading I/O bitmap @%#x (%Rrc)\n", offFirstBit, VBOXSTRICTRC_VAL(rcStrict)));
        return rcStrict;
    }

    /*
     * Perform the check.
     */
    uint16_t fPortMask = (1 << cbOperand) - 1;
    bmBytes >>= (u16Port & 7);
    if (bmBytes & fPortMask)
    {
        Log(("iemHlpCheckPortIOPermissionBitmap: u16Port=%#x LB %u - access denied (bm=%#x mask=%#x) -> #GP(0)\n",
             u16Port, cbOperand, bmBytes, fPortMask));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    return VINF_SUCCESS;
}


/**
 * Checks if we are allowed to access the given I/O port, raising the
 * appropriate exceptions if we aren't (or if the I/O bitmap is not
 * accessible).
 *
 * @returns Strict VBox status code.
 *
 * @param   pVCpu               The cross context virtual CPU structure of the calling thread.
 * @param   u16Port             The port number.
 * @param   cbOperand           The operand size.
 */
DECLINLINE(VBOXSTRICTRC) iemHlpCheckPortIOPermission(PVMCPUCC pVCpu, uint16_t u16Port, uint8_t cbOperand)
{
    X86EFLAGS Efl;
    Efl.u = IEMMISC_GET_EFL(pVCpu);
    if (   (pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE)
        && (    pVCpu->iem.s.uCpl > Efl.Bits.u2IOPL
            ||  Efl.Bits.u1VM) )
        return iemHlpCheckPortIOPermissionBitmap(pVCpu, u16Port, cbOperand);
    return VINF_SUCCESS;
}


#if 0
/**
 * Calculates the parity bit.
 *
 * @returns true if the bit is set, false if not.
 * @param   u8Result            The least significant byte of the result.
 */
static bool iemHlpCalcParityFlag(uint8_t u8Result)
{
    /*
     * Parity is set if the number of bits in the least significant byte of
     * the result is even.
     */
    uint8_t cBits;
    cBits  = u8Result & 1;              /* 0 */
    u8Result >>= 1;
    cBits += u8Result & 1;
    u8Result >>= 1;
    cBits += u8Result & 1;
    u8Result >>= 1;
    cBits += u8Result & 1;
    u8Result >>= 1;
    cBits += u8Result & 1;              /* 4 */
    u8Result >>= 1;
    cBits += u8Result & 1;
    u8Result >>= 1;
    cBits += u8Result & 1;
    u8Result >>= 1;
    cBits += u8Result & 1;
    return !(cBits & 1);
}
#endif /* not used */


/**
 * Updates the specified flags according to a 8-bit result.
 *
 * @param   pVCpu               The cross context virtual CPU structure of the calling thread.
 * @param   u8Result            The result to set the flags according to.
 * @param   fToUpdate           The flags to update.
 * @param   fUndefined          The flags that are specified as undefined.
 */
static void iemHlpUpdateArithEFlagsU8(PVMCPUCC pVCpu, uint8_t u8Result, uint32_t fToUpdate, uint32_t fUndefined)
{
    uint32_t fEFlags = pVCpu->cpum.GstCtx.eflags.u;
    iemAImpl_test_u8(&u8Result, u8Result, &fEFlags);
    pVCpu->cpum.GstCtx.eflags.u &= ~(fToUpdate | fUndefined);
    pVCpu->cpum.GstCtx.eflags.u |= (fToUpdate | fUndefined) & fEFlags;
}


/**
 * Updates the specified flags according to a 16-bit result.
 *
 * @param   pVCpu               The cross context virtual CPU structure of the calling thread.
 * @param   u16Result           The result to set the flags according to.
 * @param   fToUpdate           The flags to update.
 * @param   fUndefined          The flags that are specified as undefined.
 */
static void iemHlpUpdateArithEFlagsU16(PVMCPUCC pVCpu, uint16_t u16Result, uint32_t fToUpdate, uint32_t fUndefined)
{
    uint32_t fEFlags = pVCpu->cpum.GstCtx.eflags.u;
    iemAImpl_test_u16(&u16Result, u16Result, &fEFlags);
    pVCpu->cpum.GstCtx.eflags.u &= ~(fToUpdate | fUndefined);
    pVCpu->cpum.GstCtx.eflags.u |= (fToUpdate | fUndefined) & fEFlags;
}


/**
 * Helper used by iret.
 *
 * @param   pVCpu               The cross context virtual CPU structure of the calling thread.
 * @param   uCpl                The new CPL.
 * @param   pSReg               Pointer to the segment register.
 */
static void iemHlpAdjustSelectorForNewCpl(PVMCPUCC pVCpu, uint8_t uCpl, PCPUMSELREG pSReg)
{
    Assert(CPUMSELREG_ARE_HIDDEN_PARTS_VALID(pVCpu, pSReg));
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_SREG_MASK);

    if (   uCpl > pSReg->Attr.n.u2Dpl
        && pSReg->Attr.n.u1DescType /* code or data, not system */
        &&    (pSReg->Attr.n.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_CONF))
           !=                         (X86_SEL_TYPE_CODE | X86_SEL_TYPE_CONF)) /* not conforming code */
        iemHlpLoadNullDataSelectorProt(pVCpu, pSReg, 0);
}


/**
 * Indicates that we have modified the FPU state.
 *
 * @param   pVCpu               The cross context virtual CPU structure of the calling thread.
 */
DECLINLINE(void) iemHlpUsedFpu(PVMCPUCC pVCpu)
{
    CPUMSetChangedFlags(pVCpu, CPUM_CHANGED_FPU_REM);
}

/** @} */

/** @name C Implementations
 * @{
 */

/**
 * Implements a 16-bit popa.
 */
IEM_CIMPL_DEF_0(iemCImpl_popa_16)
{
    RTGCPTR         GCPtrStart  = iemRegGetEffRsp(pVCpu);
    RTGCPTR         GCPtrLast   = GCPtrStart + 15;
    VBOXSTRICTRC    rcStrict;

    /*
     * The docs are a bit hard to comprehend here, but it looks like we wrap
     * around in real mode as long as none of the individual "popa" crosses the
     * end of the stack segment.  In protected mode we check the whole access
     * in one go.  For efficiency, only do the word-by-word thing if we're in
     * danger of wrapping around.
     */
    /** @todo do popa boundary / wrap-around checks.  */
    if (RT_UNLIKELY(   IEM_IS_REAL_OR_V86_MODE(pVCpu)
                    && (pVCpu->cpum.GstCtx.cs.u32Limit < GCPtrLast)) ) /* ASSUMES 64-bit RTGCPTR */
    {
        /* word-by-word */
        RTUINT64U TmpRsp;
        TmpRsp.u = pVCpu->cpum.GstCtx.rsp;
        rcStrict = iemMemStackPopU16Ex(pVCpu, &pVCpu->cpum.GstCtx.di, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU16Ex(pVCpu, &pVCpu->cpum.GstCtx.si, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU16Ex(pVCpu, &pVCpu->cpum.GstCtx.bp, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
        {
            iemRegAddToRspEx(pVCpu, &TmpRsp, 2); /* sp */
            rcStrict = iemMemStackPopU16Ex(pVCpu, &pVCpu->cpum.GstCtx.bx, &TmpRsp);
        }
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU16Ex(pVCpu, &pVCpu->cpum.GstCtx.dx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU16Ex(pVCpu, &pVCpu->cpum.GstCtx.cx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU16Ex(pVCpu, &pVCpu->cpum.GstCtx.ax, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rsp = TmpRsp.u;
            rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }
    }
    else
    {
        uint16_t const *pa16Mem = NULL;
        rcStrict = iemMemMap(pVCpu, (void **)&pa16Mem, 16, X86_SREG_SS, GCPtrStart, IEM_ACCESS_STACK_R, sizeof(*pa16Mem) - 1);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.di = pa16Mem[7 - X86_GREG_xDI];
            pVCpu->cpum.GstCtx.si = pa16Mem[7 - X86_GREG_xSI];
            pVCpu->cpum.GstCtx.bp = pa16Mem[7 - X86_GREG_xBP];
            /* skip sp */
            pVCpu->cpum.GstCtx.bx = pa16Mem[7 - X86_GREG_xBX];
            pVCpu->cpum.GstCtx.dx = pa16Mem[7 - X86_GREG_xDX];
            pVCpu->cpum.GstCtx.cx = pa16Mem[7 - X86_GREG_xCX];
            pVCpu->cpum.GstCtx.ax = pa16Mem[7 - X86_GREG_xAX];
            rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)pa16Mem, IEM_ACCESS_STACK_R);
            if (rcStrict == VINF_SUCCESS)
            {
                iemRegAddToRsp(pVCpu, 16);
                rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
            }
        }
    }
    return rcStrict;
}


/**
 * Implements a 32-bit popa.
 */
IEM_CIMPL_DEF_0(iemCImpl_popa_32)
{
    RTGCPTR         GCPtrStart  = iemRegGetEffRsp(pVCpu);
    RTGCPTR         GCPtrLast   = GCPtrStart + 31;
    VBOXSTRICTRC    rcStrict;

    /*
     * The docs are a bit hard to comprehend here, but it looks like we wrap
     * around in real mode as long as none of the individual "popa" crosses the
     * end of the stack segment.  In protected mode we check the whole access
     * in one go.  For efficiency, only do the word-by-word thing if we're in
     * danger of wrapping around.
     */
    /** @todo do popa boundary / wrap-around checks.  */
    if (RT_UNLIKELY(   IEM_IS_REAL_OR_V86_MODE(pVCpu)
                    && (pVCpu->cpum.GstCtx.cs.u32Limit < GCPtrLast)) ) /* ASSUMES 64-bit RTGCPTR */
    {
        /* word-by-word */
        RTUINT64U TmpRsp;
        TmpRsp.u = pVCpu->cpum.GstCtx.rsp;
        rcStrict = iemMemStackPopU32Ex(pVCpu, &pVCpu->cpum.GstCtx.edi, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU32Ex(pVCpu, &pVCpu->cpum.GstCtx.esi, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU32Ex(pVCpu, &pVCpu->cpum.GstCtx.ebp, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
        {
            iemRegAddToRspEx(pVCpu, &TmpRsp, 2); /* sp */
            rcStrict = iemMemStackPopU32Ex(pVCpu, &pVCpu->cpum.GstCtx.ebx, &TmpRsp);
        }
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU32Ex(pVCpu, &pVCpu->cpum.GstCtx.edx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU32Ex(pVCpu, &pVCpu->cpum.GstCtx.ecx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPopU32Ex(pVCpu, &pVCpu->cpum.GstCtx.eax, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
        {
#if 1  /** @todo what actually happens with the high bits when we're in 16-bit mode? */
            pVCpu->cpum.GstCtx.rdi &= UINT32_MAX;
            pVCpu->cpum.GstCtx.rsi &= UINT32_MAX;
            pVCpu->cpum.GstCtx.rbp &= UINT32_MAX;
            pVCpu->cpum.GstCtx.rbx &= UINT32_MAX;
            pVCpu->cpum.GstCtx.rdx &= UINT32_MAX;
            pVCpu->cpum.GstCtx.rcx &= UINT32_MAX;
            pVCpu->cpum.GstCtx.rax &= UINT32_MAX;
#endif
            pVCpu->cpum.GstCtx.rsp = TmpRsp.u;
            rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }
    }
    else
    {
        uint32_t const *pa32Mem;
        rcStrict = iemMemMap(pVCpu, (void **)&pa32Mem, 32, X86_SREG_SS, GCPtrStart, IEM_ACCESS_STACK_R, sizeof(*pa32Mem) - 1);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rdi = pa32Mem[7 - X86_GREG_xDI];
            pVCpu->cpum.GstCtx.rsi = pa32Mem[7 - X86_GREG_xSI];
            pVCpu->cpum.GstCtx.rbp = pa32Mem[7 - X86_GREG_xBP];
            /* skip esp */
            pVCpu->cpum.GstCtx.rbx = pa32Mem[7 - X86_GREG_xBX];
            pVCpu->cpum.GstCtx.rdx = pa32Mem[7 - X86_GREG_xDX];
            pVCpu->cpum.GstCtx.rcx = pa32Mem[7 - X86_GREG_xCX];
            pVCpu->cpum.GstCtx.rax = pa32Mem[7 - X86_GREG_xAX];
            rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)pa32Mem, IEM_ACCESS_STACK_R);
            if (rcStrict == VINF_SUCCESS)
            {
                iemRegAddToRsp(pVCpu, 32);
                rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
            }
        }
    }
    return rcStrict;
}


/**
 * Implements a 16-bit pusha.
 */
IEM_CIMPL_DEF_0(iemCImpl_pusha_16)
{
    RTGCPTR         GCPtrTop    = iemRegGetEffRsp(pVCpu);
    RTGCPTR         GCPtrBottom = GCPtrTop - 15;
    VBOXSTRICTRC    rcStrict;

    /*
     * The docs are a bit hard to comprehend here, but it looks like we wrap
     * around in real mode as long as none of the individual "pushd" crosses the
     * end of the stack segment.  In protected mode we check the whole access
     * in one go.  For efficiency, only do the word-by-word thing if we're in
     * danger of wrapping around.
     */
    /** @todo do pusha boundary / wrap-around checks.  */
    if (RT_UNLIKELY(   GCPtrBottom > GCPtrTop
                    && IEM_IS_REAL_OR_V86_MODE(pVCpu) ) )
    {
        /* word-by-word */
        RTUINT64U TmpRsp;
        TmpRsp.u = pVCpu->cpum.GstCtx.rsp;
        rcStrict = iemMemStackPushU16Ex(pVCpu, pVCpu->cpum.GstCtx.ax, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU16Ex(pVCpu, pVCpu->cpum.GstCtx.cx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU16Ex(pVCpu, pVCpu->cpum.GstCtx.dx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU16Ex(pVCpu, pVCpu->cpum.GstCtx.bx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU16Ex(pVCpu, pVCpu->cpum.GstCtx.sp, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU16Ex(pVCpu, pVCpu->cpum.GstCtx.bp, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU16Ex(pVCpu, pVCpu->cpum.GstCtx.si, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU16Ex(pVCpu, pVCpu->cpum.GstCtx.di, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rsp = TmpRsp.u;
            rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }
    }
    else
    {
        GCPtrBottom--;
        uint16_t *pa16Mem = NULL;
        rcStrict = iemMemMap(pVCpu, (void **)&pa16Mem, 16, X86_SREG_SS, GCPtrBottom, IEM_ACCESS_STACK_W, sizeof(*pa16Mem) - 1);
        if (rcStrict == VINF_SUCCESS)
        {
            pa16Mem[7 - X86_GREG_xDI] = pVCpu->cpum.GstCtx.di;
            pa16Mem[7 - X86_GREG_xSI] = pVCpu->cpum.GstCtx.si;
            pa16Mem[7 - X86_GREG_xBP] = pVCpu->cpum.GstCtx.bp;
            pa16Mem[7 - X86_GREG_xSP] = pVCpu->cpum.GstCtx.sp;
            pa16Mem[7 - X86_GREG_xBX] = pVCpu->cpum.GstCtx.bx;
            pa16Mem[7 - X86_GREG_xDX] = pVCpu->cpum.GstCtx.dx;
            pa16Mem[7 - X86_GREG_xCX] = pVCpu->cpum.GstCtx.cx;
            pa16Mem[7 - X86_GREG_xAX] = pVCpu->cpum.GstCtx.ax;
            rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)pa16Mem, IEM_ACCESS_STACK_W);
            if (rcStrict == VINF_SUCCESS)
            {
                iemRegSubFromRsp(pVCpu, 16);
                rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
            }
        }
    }
    return rcStrict;
}


/**
 * Implements a 32-bit pusha.
 */
IEM_CIMPL_DEF_0(iemCImpl_pusha_32)
{
    RTGCPTR         GCPtrTop    = iemRegGetEffRsp(pVCpu);
    RTGCPTR         GCPtrBottom = GCPtrTop - 31;
    VBOXSTRICTRC    rcStrict;

    /*
     * The docs are a bit hard to comprehend here, but it looks like we wrap
     * around in real mode as long as none of the individual "pusha" crosses the
     * end of the stack segment.  In protected mode we check the whole access
     * in one go.  For efficiency, only do the word-by-word thing if we're in
     * danger of wrapping around.
     */
    /** @todo do pusha boundary / wrap-around checks.  */
    if (RT_UNLIKELY(   GCPtrBottom > GCPtrTop
                    && IEM_IS_REAL_OR_V86_MODE(pVCpu) ) )
    {
        /* word-by-word */
        RTUINT64U TmpRsp;
        TmpRsp.u = pVCpu->cpum.GstCtx.rsp;
        rcStrict = iemMemStackPushU32Ex(pVCpu, pVCpu->cpum.GstCtx.eax, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU32Ex(pVCpu, pVCpu->cpum.GstCtx.ecx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU32Ex(pVCpu, pVCpu->cpum.GstCtx.edx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU32Ex(pVCpu, pVCpu->cpum.GstCtx.ebx, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU32Ex(pVCpu, pVCpu->cpum.GstCtx.esp, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU32Ex(pVCpu, pVCpu->cpum.GstCtx.ebp, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU32Ex(pVCpu, pVCpu->cpum.GstCtx.esi, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
            rcStrict = iemMemStackPushU32Ex(pVCpu, pVCpu->cpum.GstCtx.edi, &TmpRsp);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rsp = TmpRsp.u;
            rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }
    }
    else
    {
        GCPtrBottom--;
        uint32_t *pa32Mem;
        rcStrict = iemMemMap(pVCpu, (void **)&pa32Mem, 32, X86_SREG_SS, GCPtrBottom, IEM_ACCESS_STACK_W, sizeof(*pa32Mem) - 1);
        if (rcStrict == VINF_SUCCESS)
        {
            pa32Mem[7 - X86_GREG_xDI] = pVCpu->cpum.GstCtx.edi;
            pa32Mem[7 - X86_GREG_xSI] = pVCpu->cpum.GstCtx.esi;
            pa32Mem[7 - X86_GREG_xBP] = pVCpu->cpum.GstCtx.ebp;
            pa32Mem[7 - X86_GREG_xSP] = pVCpu->cpum.GstCtx.esp;
            pa32Mem[7 - X86_GREG_xBX] = pVCpu->cpum.GstCtx.ebx;
            pa32Mem[7 - X86_GREG_xDX] = pVCpu->cpum.GstCtx.edx;
            pa32Mem[7 - X86_GREG_xCX] = pVCpu->cpum.GstCtx.ecx;
            pa32Mem[7 - X86_GREG_xAX] = pVCpu->cpum.GstCtx.eax;
            rcStrict = iemMemCommitAndUnmap(pVCpu, pa32Mem, IEM_ACCESS_STACK_W);
            if (rcStrict == VINF_SUCCESS)
            {
                iemRegSubFromRsp(pVCpu, 32);
                rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
            }
        }
    }
    return rcStrict;
}


/**
 * Implements pushf.
 *
 *
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_1(iemCImpl_pushf, IEMMODE, enmEffOpSize)
{
    VBOXSTRICTRC rcStrict;

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_PUSHF))
    {
        Log2(("pushf: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_PUSHF, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * If we're in V8086 mode some care is required (which is why we're in
     * doing this in a C implementation).
     */
    uint32_t fEfl = IEMMISC_GET_EFL(pVCpu);
    if (   (fEfl & X86_EFL_VM)
        && X86_EFL_GET_IOPL(fEfl) != 3 )
    {
        Assert(pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE);
        if (   enmEffOpSize != IEMMODE_16BIT
            || !(pVCpu->cpum.GstCtx.cr4 & X86_CR4_VME))
            return iemRaiseGeneralProtectionFault0(pVCpu);
        fEfl &= ~X86_EFL_IF;          /* (RF and VM are out of range) */
        fEfl |= (fEfl & X86_EFL_VIF) >> (19 - 9);
        rcStrict = iemMemStackPushU16(pVCpu, (uint16_t)fEfl);
    }
    else
    {

        /*
         * Ok, clear RF and VM, adjust for ancient CPUs, and push the flags.
         */
        fEfl &= ~(X86_EFL_RF | X86_EFL_VM);

        switch (enmEffOpSize)
        {
            case IEMMODE_16BIT:
                AssertCompile(IEMTARGETCPU_8086 <= IEMTARGETCPU_186 && IEMTARGETCPU_V20 <= IEMTARGETCPU_186 && IEMTARGETCPU_286 > IEMTARGETCPU_186);
                if (IEM_GET_TARGET_CPU(pVCpu) <= IEMTARGETCPU_186)
                    fEfl |= UINT16_C(0xf000);
                rcStrict = iemMemStackPushU16(pVCpu, (uint16_t)fEfl);
                break;
            case IEMMODE_32BIT:
                rcStrict = iemMemStackPushU32(pVCpu, fEfl);
                break;
            case IEMMODE_64BIT:
                rcStrict = iemMemStackPushU64(pVCpu, fEfl);
                break;
            IEM_NOT_REACHED_DEFAULT_CASE_RET();
        }
    }

    if (rcStrict == VINF_SUCCESS)
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    return rcStrict;
}


/**
 * Implements popf.
 *
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_1(iemCImpl_popf, IEMMODE, enmEffOpSize)
{
    uint32_t const  fEflOld = IEMMISC_GET_EFL(pVCpu);
    VBOXSTRICTRC    rcStrict;
    uint32_t        fEflNew;

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_POPF))
    {
        Log2(("popf: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_POPF, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * V8086 is special as usual.
     */
    if (fEflOld & X86_EFL_VM)
    {
        /*
         * Almost anything goes if IOPL is 3.
         */
        if (X86_EFL_GET_IOPL(fEflOld) == 3)
        {
            switch (enmEffOpSize)
            {
                case IEMMODE_16BIT:
                {
                    uint16_t u16Value;
                    rcStrict = iemMemStackPopU16(pVCpu, &u16Value);
                    if (rcStrict != VINF_SUCCESS)
                        return rcStrict;
                    fEflNew = u16Value | (fEflOld & UINT32_C(0xffff0000));
                    break;
                }
                case IEMMODE_32BIT:
                    rcStrict = iemMemStackPopU32(pVCpu, &fEflNew);
                    if (rcStrict != VINF_SUCCESS)
                        return rcStrict;
                    break;
                IEM_NOT_REACHED_DEFAULT_CASE_RET();
            }

            const uint32_t fPopfBits = pVCpu->CTX_SUFF(pVM)->cpum.ro.GuestFeatures.enmMicroarch != kCpumMicroarch_Intel_80386
                                     ? X86_EFL_POPF_BITS : X86_EFL_POPF_BITS_386;
            fEflNew &=   fPopfBits & ~(X86_EFL_IOPL);
            fEflNew |= ~(fPopfBits & ~(X86_EFL_IOPL)) & fEflOld;
        }
        /*
         * Interrupt flag virtualization with CR4.VME=1.
         */
        else if (   enmEffOpSize == IEMMODE_16BIT
                 && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_VME) )
        {
            uint16_t    u16Value;
            RTUINT64U   TmpRsp;
            TmpRsp.u = pVCpu->cpum.GstCtx.rsp;
            rcStrict = iemMemStackPopU16Ex(pVCpu, &u16Value, &TmpRsp);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;

            /** @todo Is the popf VME \#GP(0) delivered after updating RSP+RIP
             *        or before? */
            if (    (   (u16Value & X86_EFL_IF)
                     && (fEflOld  & X86_EFL_VIP))
                ||  (u16Value & X86_EFL_TF) )
                return iemRaiseGeneralProtectionFault0(pVCpu);

            fEflNew = u16Value | (fEflOld & UINT32_C(0xffff0000) & ~X86_EFL_VIF);
            fEflNew |= (fEflNew & X86_EFL_IF) << (19 - 9);
            fEflNew &=   X86_EFL_POPF_BITS & ~(X86_EFL_IOPL | X86_EFL_IF);
            fEflNew |= ~(X86_EFL_POPF_BITS & ~(X86_EFL_IOPL | X86_EFL_IF)) & fEflOld;

            pVCpu->cpum.GstCtx.rsp = TmpRsp.u;
        }
        else
            return iemRaiseGeneralProtectionFault0(pVCpu);

    }
    /*
     * Not in V8086 mode.
     */
    else
    {
        /* Pop the flags. */
        switch (enmEffOpSize)
        {
            case IEMMODE_16BIT:
            {
                uint16_t u16Value;
                rcStrict = iemMemStackPopU16(pVCpu, &u16Value);
                if (rcStrict != VINF_SUCCESS)
                    return rcStrict;
                fEflNew = u16Value | (fEflOld & UINT32_C(0xffff0000));

                /*
                 * Ancient CPU adjustments:
                 *  - 8086, 80186, V20/30:
                 *    Fixed bits 15:12 bits are not kept correctly internally, mostly for
                 *    practical reasons (masking below).  We add them when pushing flags.
                 *  - 80286:
                 *    The NT and IOPL flags cannot be popped from real mode and are
                 *    therefore always zero (since a 286 can never exit from PM and
                 *    their initial value is zero).  This changed on a 386 and can
                 *    therefore be used to detect 286 or 386 CPU in real mode.
                 */
                if (   IEM_GET_TARGET_CPU(pVCpu) == IEMTARGETCPU_286
                    && !(pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE) )
                    fEflNew &= ~(X86_EFL_NT | X86_EFL_IOPL);
                break;
            }
            case IEMMODE_32BIT:
                rcStrict = iemMemStackPopU32(pVCpu, &fEflNew);
                if (rcStrict != VINF_SUCCESS)
                    return rcStrict;
                break;
            case IEMMODE_64BIT:
            {
                uint64_t u64Value;
                rcStrict = iemMemStackPopU64(pVCpu, &u64Value);
                if (rcStrict != VINF_SUCCESS)
                    return rcStrict;
                fEflNew = u64Value;  /** @todo testcase: Check exactly what happens if high bits are set. */
                break;
            }
            IEM_NOT_REACHED_DEFAULT_CASE_RET();
        }

        /* Merge them with the current flags. */
        const uint32_t fPopfBits = pVCpu->CTX_SUFF(pVM)->cpum.ro.GuestFeatures.enmMicroarch != kCpumMicroarch_Intel_80386
                                 ? X86_EFL_POPF_BITS : X86_EFL_POPF_BITS_386;
        if (   (fEflNew & (X86_EFL_IOPL | X86_EFL_IF)) == (fEflOld & (X86_EFL_IOPL | X86_EFL_IF))
            || pVCpu->iem.s.uCpl == 0)
        {
            fEflNew &=  fPopfBits;
            fEflNew |= ~fPopfBits & fEflOld;
        }
        else if (pVCpu->iem.s.uCpl <= X86_EFL_GET_IOPL(fEflOld))
        {
            fEflNew &=   fPopfBits & ~(X86_EFL_IOPL);
            fEflNew |= ~(fPopfBits & ~(X86_EFL_IOPL)) & fEflOld;
        }
        else
        {
            fEflNew &=   fPopfBits & ~(X86_EFL_IOPL | X86_EFL_IF);
            fEflNew |= ~(fPopfBits & ~(X86_EFL_IOPL | X86_EFL_IF)) & fEflOld;
        }
    }

    /*
     * Commit the flags.
     */
    Assert(fEflNew & RT_BIT_32(1));
    IEMMISC_SET_EFL(pVCpu, fEflNew);
    return iemRegAddToRipAndFinishingClearingRfEx(pVCpu, cbInstr, fEflOld);
}


/**
 * Implements an indirect call.
 *
 * @param   uNewPC          The new program counter (RIP) value (loaded from the
 *                          operand).
 */
IEM_CIMPL_DEF_1(iemCImpl_call_16, uint16_t, uNewPC)
{
    uint16_t const uOldPC = pVCpu->cpum.GstCtx.ip + cbInstr;
    if (uNewPC <= pVCpu->cpum.GstCtx.cs.u32Limit)
    {
        VBOXSTRICTRC rcStrict = iemMemStackPushU16(pVCpu, uOldPC);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rip = uNewPC;
            IEM_FLUSH_PREFETCH_LIGHT(pVCpu, cbInstr);
            return iemRegFinishClearingRF(pVCpu);
        }
        return rcStrict;
    }
    return iemRaiseGeneralProtectionFault0(pVCpu);
}


/**
 * Implements a 16-bit relative call.
 *
 * @param   offDisp      The displacment offset.
 */
IEM_CIMPL_DEF_1(iemCImpl_call_rel_16, int16_t, offDisp)
{
    uint16_t const uOldPC = pVCpu->cpum.GstCtx.ip + cbInstr;
    uint16_t const uNewPC = uOldPC + offDisp;
    if (uNewPC <= pVCpu->cpum.GstCtx.cs.u32Limit)
    {
        VBOXSTRICTRC rcStrict = iemMemStackPushU16(pVCpu, uOldPC);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rip = uNewPC;
            IEM_FLUSH_PREFETCH_LIGHT(pVCpu, cbInstr);
            return iemRegFinishClearingRF(pVCpu);
        }
        return rcStrict;
    }
    return iemRaiseGeneralProtectionFault0(pVCpu);
}


/**
 * Implements a 32-bit indirect call.
 *
 * @param   uNewPC          The new program counter (RIP) value (loaded from the
 *                          operand).
 */
IEM_CIMPL_DEF_1(iemCImpl_call_32, uint32_t, uNewPC)
{
    uint32_t const uOldPC = pVCpu->cpum.GstCtx.eip + cbInstr;
    if (uNewPC <= pVCpu->cpum.GstCtx.cs.u32Limit)
    {
        VBOXSTRICTRC rcStrict = iemMemStackPushU32(pVCpu, uOldPC);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rip = uNewPC;
            IEM_FLUSH_PREFETCH_LIGHT(pVCpu, cbInstr);
            return iemRegFinishClearingRF(pVCpu);
        }
        return rcStrict;
    }
    return iemRaiseGeneralProtectionFault0(pVCpu);
}


/**
 * Implements a 32-bit relative call.
 *
 * @param   offDisp      The displacment offset.
 */
IEM_CIMPL_DEF_1(iemCImpl_call_rel_32, int32_t, offDisp)
{
    uint32_t const uOldPC = pVCpu->cpum.GstCtx.eip + cbInstr;
    uint32_t const uNewPC = uOldPC + offDisp;
    if (uNewPC <= pVCpu->cpum.GstCtx.cs.u32Limit)
    {
        VBOXSTRICTRC rcStrict = iemMemStackPushU32(pVCpu, uOldPC);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rip = uNewPC;
            IEM_FLUSH_PREFETCH_LIGHT(pVCpu, cbInstr);
            return iemRegFinishClearingRF(pVCpu);
        }
        return rcStrict;
    }
    return iemRaiseGeneralProtectionFault0(pVCpu);
}


/**
 * Implements a 64-bit indirect call.
 *
 * @param   uNewPC          The new program counter (RIP) value (loaded from the
 *                          operand).
 */
IEM_CIMPL_DEF_1(iemCImpl_call_64, uint64_t, uNewPC)
{
    uint64_t const uOldPC = pVCpu->cpum.GstCtx.rip + cbInstr;
    if (IEM_IS_CANONICAL(uNewPC))
    {
        VBOXSTRICTRC rcStrict = iemMemStackPushU64(pVCpu, uOldPC);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rip = uNewPC;
            IEM_FLUSH_PREFETCH_LIGHT(pVCpu, cbInstr);
            return iemRegFinishClearingRF(pVCpu);
        }
        return rcStrict;
    }
    return iemRaiseGeneralProtectionFault0(pVCpu);
}


/**
 * Implements a 64-bit relative call.
 *
 * @param   offDisp      The displacment offset.
 */
IEM_CIMPL_DEF_1(iemCImpl_call_rel_64, int64_t, offDisp)
{
    uint64_t const uOldPC = pVCpu->cpum.GstCtx.rip + cbInstr;
    uint64_t const uNewPC = uOldPC + offDisp;
    if (IEM_IS_CANONICAL(uNewPC))
    {
        VBOXSTRICTRC rcStrict = iemMemStackPushU64(pVCpu, uOldPC);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.rip = uNewPC;
            IEM_FLUSH_PREFETCH_LIGHT(pVCpu, cbInstr);
            return iemRegFinishClearingRF(pVCpu);
        }
        return rcStrict;
    }
    return iemRaiseNotCanonical(pVCpu);
}


/**
 * Implements far jumps and calls thru task segments (TSS).
 *
 * @returns VBox strict status code.
 * @param   pVCpu           The cross context virtual CPU structure of the
 *                          calling thread.
 * @param   cbInstr         The current instruction length.
 * @param   uSel            The selector.
 * @param   enmBranch       The kind of branching we're performing.
 * @param   enmEffOpSize    The effective operand size.
 * @param   pDesc           The descriptor corresponding to @a uSel. The type is
 *                          task gate.
 */
static VBOXSTRICTRC iemCImpl_BranchTaskSegment(PVMCPUCC pVCpu, uint8_t cbInstr, uint16_t uSel, IEMBRANCH enmBranch,
                                               IEMMODE enmEffOpSize, PIEMSELDESC pDesc)
{
#ifndef IEM_IMPLEMENTS_TASKSWITCH
    IEM_RETURN_ASPECT_NOT_IMPLEMENTED();
#else
    Assert(enmBranch == IEMBRANCH_JUMP || enmBranch == IEMBRANCH_CALL);
    Assert(   pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_286_TSS_AVAIL
           || pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_386_TSS_AVAIL);
    RT_NOREF_PV(enmEffOpSize);
    IEM_CTX_ASSERT(pVCpu, IEM_CPUMCTX_EXTRN_XCPT_MASK);

    if (   pDesc->Legacy.Gate.u2Dpl < pVCpu->iem.s.uCpl
        || pDesc->Legacy.Gate.u2Dpl < (uSel & X86_SEL_RPL))
    {
        Log(("BranchTaskSegment invalid priv. uSel=%04x TSS DPL=%d CPL=%u Sel RPL=%u -> #GP\n", uSel, pDesc->Legacy.Gate.u2Dpl,
             pVCpu->iem.s.uCpl, (uSel & X86_SEL_RPL)));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel & X86_SEL_MASK_OFF_RPL);
    }

    /** @todo This is checked earlier for far jumps (see iemCImpl_FarJmp) but not
     *        far calls (see iemCImpl_callf). Most likely in both cases it should be
     *        checked here, need testcases. */
    if (!pDesc->Legacy.Gen.u1Present)
    {
        Log(("BranchTaskSegment TSS not present uSel=%04x -> #NP\n", uSel));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uSel & X86_SEL_MASK_OFF_RPL);
    }

    uint32_t uNextEip = pVCpu->cpum.GstCtx.eip + cbInstr;
    return iemTaskSwitch(pVCpu, enmBranch == IEMBRANCH_JUMP ? IEMTASKSWITCH_JUMP : IEMTASKSWITCH_CALL,
                         uNextEip, 0 /* fFlags */, 0 /* uErr */, 0 /* uCr2 */, uSel, pDesc);
#endif
}


/**
 * Implements far jumps and calls thru task gates.
 *
 * @returns VBox strict status code.
 * @param   pVCpu           The cross context virtual CPU structure of the
 *                          calling thread.
 * @param   cbInstr         The current instruction length.
 * @param   uSel            The selector.
 * @param   enmBranch       The kind of branching we're performing.
 * @param   enmEffOpSize    The effective operand size.
 * @param   pDesc           The descriptor corresponding to @a uSel. The type is
 *                          task gate.
 */
static VBOXSTRICTRC iemCImpl_BranchTaskGate(PVMCPUCC pVCpu, uint8_t cbInstr, uint16_t uSel, IEMBRANCH enmBranch,
                                            IEMMODE enmEffOpSize, PIEMSELDESC pDesc)
{
#ifndef IEM_IMPLEMENTS_TASKSWITCH
    IEM_RETURN_ASPECT_NOT_IMPLEMENTED();
#else
    Assert(enmBranch == IEMBRANCH_JUMP || enmBranch == IEMBRANCH_CALL);
    RT_NOREF_PV(enmEffOpSize);
    IEM_CTX_ASSERT(pVCpu, IEM_CPUMCTX_EXTRN_XCPT_MASK);

    if (   pDesc->Legacy.Gate.u2Dpl < pVCpu->iem.s.uCpl
        || pDesc->Legacy.Gate.u2Dpl < (uSel & X86_SEL_RPL))
    {
        Log(("BranchTaskGate invalid priv. uSel=%04x TSS DPL=%d CPL=%u Sel RPL=%u -> #GP\n", uSel, pDesc->Legacy.Gate.u2Dpl,
             pVCpu->iem.s.uCpl, (uSel & X86_SEL_RPL)));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel & X86_SEL_MASK_OFF_RPL);
    }

    /** @todo This is checked earlier for far jumps (see iemCImpl_FarJmp) but not
     *        far calls (see iemCImpl_callf). Most likely in both cases it should be
     *        checked here, need testcases. */
    if (!pDesc->Legacy.Gen.u1Present)
    {
        Log(("BranchTaskSegment segment not present uSel=%04x -> #NP\n", uSel));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uSel & X86_SEL_MASK_OFF_RPL);
    }

    /*
     * Fetch the new TSS descriptor from the GDT.
     */
    RTSEL uSelTss = pDesc->Legacy.Gate.u16Sel;
    if (uSelTss  & X86_SEL_LDT)
    {
        Log(("BranchTaskGate TSS is in LDT. uSel=%04x uSelTss=%04x -> #GP\n", uSel, uSelTss));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel & X86_SEL_MASK_OFF_RPL);
    }

    IEMSELDESC TssDesc;
    VBOXSTRICTRC rcStrict = iemMemFetchSelDesc(pVCpu, &TssDesc, uSelTss, X86_XCPT_GP);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    if (TssDesc.Legacy.Gate.u4Type & X86_SEL_TYPE_SYS_TSS_BUSY_MASK)
    {
        Log(("BranchTaskGate TSS is busy. uSel=%04x uSelTss=%04x DescType=%#x -> #GP\n", uSel, uSelTss,
             TssDesc.Legacy.Gate.u4Type));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel & X86_SEL_MASK_OFF_RPL);
    }

    if (!TssDesc.Legacy.Gate.u1Present)
    {
        Log(("BranchTaskGate TSS is not present. uSel=%04x uSelTss=%04x -> #NP\n", uSel, uSelTss));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uSelTss & X86_SEL_MASK_OFF_RPL);
    }

    uint32_t uNextEip = pVCpu->cpum.GstCtx.eip + cbInstr;
    return iemTaskSwitch(pVCpu, enmBranch == IEMBRANCH_JUMP ? IEMTASKSWITCH_JUMP : IEMTASKSWITCH_CALL,
                         uNextEip, 0 /* fFlags */, 0 /* uErr */, 0 /* uCr2 */, uSelTss, &TssDesc);
#endif
}


/**
 * Implements far jumps and calls thru call gates.
 *
 * @returns VBox strict status code.
 * @param   pVCpu           The cross context virtual CPU structure of the
 *                          calling thread.
 * @param   cbInstr         The current instruction length.
 * @param   uSel            The selector.
 * @param   enmBranch       The kind of branching we're performing.
 * @param   enmEffOpSize    The effective operand size.
 * @param   pDesc           The descriptor corresponding to @a uSel. The type is
 *                          call gate.
 */
static VBOXSTRICTRC iemCImpl_BranchCallGate(PVMCPUCC pVCpu, uint8_t cbInstr, uint16_t uSel, IEMBRANCH enmBranch,
                                            IEMMODE enmEffOpSize, PIEMSELDESC pDesc)
{
#define IEM_IMPLEMENTS_CALLGATE
#ifndef IEM_IMPLEMENTS_CALLGATE
    IEM_RETURN_ASPECT_NOT_IMPLEMENTED();
#else
    RT_NOREF_PV(enmEffOpSize);
    IEM_CTX_ASSERT(pVCpu, IEM_CPUMCTX_EXTRN_XCPT_MASK);

    /* NB: Far jumps can only do intra-privilege transfers. Far calls support
     * inter-privilege calls and are much more complex.
     *
     * NB: 64-bit call gate has the same type as a 32-bit call gate! If
     * EFER.LMA=1, the gate must be 64-bit. Conversely if EFER.LMA=0, the gate
     * must be 16-bit or 32-bit.
     */
    /** @todo effective operand size is probably irrelevant here, only the
     *        call gate bitness matters??
     */
    VBOXSTRICTRC    rcStrict;
    RTPTRUNION      uPtrRet;
    uint64_t        uNewRsp;
    uint64_t        uNewRip;
    uint64_t        u64Base;
    uint32_t        cbLimit;
    RTSEL           uNewCS;
    IEMSELDESC      DescCS;

    AssertCompile(X86_SEL_TYPE_SYS_386_CALL_GATE == AMD64_SEL_TYPE_SYS_CALL_GATE);
    Assert(enmBranch == IEMBRANCH_JUMP || enmBranch == IEMBRANCH_CALL);
    Assert(   pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_286_CALL_GATE
           || pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_386_CALL_GATE);

    /* Determine the new instruction pointer from the gate descriptor. */
    uNewRip = pDesc->Legacy.Gate.u16OffsetLow
            | ((uint32_t)pDesc->Legacy.Gate.u16OffsetHigh << 16)
            | ((uint64_t)pDesc->Long.Gate.u32OffsetTop    << 32);

    /* Perform DPL checks on the gate descriptor. */
    if (   pDesc->Legacy.Gate.u2Dpl < pVCpu->iem.s.uCpl
        || pDesc->Legacy.Gate.u2Dpl < (uSel & X86_SEL_RPL))
    {
        Log(("BranchCallGate invalid priv. uSel=%04x Gate DPL=%d CPL=%u Sel RPL=%u -> #GP\n", uSel, pDesc->Legacy.Gate.u2Dpl,
             pVCpu->iem.s.uCpl, (uSel & X86_SEL_RPL)));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
    }

    /** @todo does this catch NULL selectors, too? */
    if (!pDesc->Legacy.Gen.u1Present)
    {
        Log(("BranchCallGate Gate not present uSel=%04x -> #NP\n", uSel));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uSel);
    }

    /*
     * Fetch the target CS descriptor from the GDT or LDT.
     */
    uNewCS = pDesc->Legacy.Gate.u16Sel;
    rcStrict = iemMemFetchSelDesc(pVCpu, &DescCS, uNewCS, X86_XCPT_GP);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Target CS must be a code selector. */
    if (   !DescCS.Legacy.Gen.u1DescType
        || !(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_CODE) )
    {
        Log(("BranchCallGate %04x:%08RX64 -> not a code selector (u1DescType=%u u4Type=%#x).\n",
             uNewCS, uNewRip, DescCS.Legacy.Gen.u1DescType, DescCS.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCS);
    }

    /* Privilege checks on target CS. */
    if (enmBranch == IEMBRANCH_JUMP)
    {
        if (DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_CONF)
        {
            if (DescCS.Legacy.Gen.u2Dpl > pVCpu->iem.s.uCpl)
            {
                Log(("BranchCallGate jump (conforming) bad DPL uNewCS=%04x Gate DPL=%d CPL=%u -> #GP\n",
                     uNewCS, DescCS.Legacy.Gen.u2Dpl, pVCpu->iem.s.uCpl));
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCS);
            }
        }
        else
        {
            if (DescCS.Legacy.Gen.u2Dpl != pVCpu->iem.s.uCpl)
            {
                Log(("BranchCallGate jump (non-conforming) bad DPL uNewCS=%04x Gate DPL=%d CPL=%u -> #GP\n",
                     uNewCS, DescCS.Legacy.Gen.u2Dpl, pVCpu->iem.s.uCpl));
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCS);
            }
        }
    }
    else
    {
        Assert(enmBranch == IEMBRANCH_CALL);
        if (DescCS.Legacy.Gen.u2Dpl > pVCpu->iem.s.uCpl)
        {
            Log(("BranchCallGate call invalid priv. uNewCS=%04x Gate DPL=%d CPL=%u -> #GP\n",
                 uNewCS, DescCS.Legacy.Gen.u2Dpl, pVCpu->iem.s.uCpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCS & X86_SEL_MASK_OFF_RPL);
        }
    }

    /* Additional long mode checks. */
    if (IEM_IS_LONG_MODE(pVCpu))
    {
        if (!DescCS.Legacy.Gen.u1Long)
        {
            Log(("BranchCallGate uNewCS %04x -> not a 64-bit code segment.\n", uNewCS));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCS);
        }

        /* L vs D. */
        if (   DescCS.Legacy.Gen.u1Long
            && DescCS.Legacy.Gen.u1DefBig)
        {
            Log(("BranchCallGate uNewCS %04x -> both L and D are set.\n", uNewCS));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCS);
        }
    }

    if (!DescCS.Legacy.Gate.u1Present)
    {
        Log(("BranchCallGate target CS is not present. uSel=%04x uNewCS=%04x -> #NP(CS)\n", uSel, uNewCS));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uNewCS);
    }

    if (enmBranch == IEMBRANCH_JUMP)
    {
        /** @todo This is very similar to regular far jumps; merge! */
        /* Jumps are fairly simple... */

        /* Chop the high bits off if 16-bit gate (Intel says so). */
        if (pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_286_CALL_GATE)
            uNewRip = (uint16_t)uNewRip;

        /* Limit check for non-long segments. */
        cbLimit = X86DESC_LIMIT_G(&DescCS.Legacy);
        if (DescCS.Legacy.Gen.u1Long)
            u64Base = 0;
        else
        {
            if (uNewRip > cbLimit)
            {
                Log(("BranchCallGate jump %04x:%08RX64 -> out of bounds (%#x) -> #GP(0)\n", uNewCS, uNewRip, cbLimit));
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, 0);
            }
            u64Base = X86DESC_BASE(&DescCS.Legacy);
        }

        /* Canonical address check. */
        if (!IEM_IS_CANONICAL(uNewRip))
        {
            Log(("BranchCallGate jump %04x:%016RX64 - not canonical -> #GP\n", uNewCS, uNewRip));
            return iemRaiseNotCanonical(pVCpu);
        }

        /*
         * Ok, everything checked out fine.  Now set the accessed bit before
         * committing the result into CS, CSHID and RIP.
         */
        if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
        {
            rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewCS);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
            /** @todo check what VT-x and AMD-V does. */
            DescCS.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
        }

        /* commit */
        pVCpu->cpum.GstCtx.rip         = uNewRip;
        pVCpu->cpum.GstCtx.cs.Sel      = uNewCS & X86_SEL_MASK_OFF_RPL;
        pVCpu->cpum.GstCtx.cs.Sel     |= pVCpu->iem.s.uCpl; /** @todo is this right for conforming segs? or in general? */
        pVCpu->cpum.GstCtx.cs.ValidSel = pVCpu->cpum.GstCtx.cs.Sel;
        pVCpu->cpum.GstCtx.cs.fFlags   = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.cs.Attr.u   = X86DESC_GET_HID_ATTR(&DescCS.Legacy);
        pVCpu->cpum.GstCtx.cs.u32Limit = cbLimit;
        pVCpu->cpum.GstCtx.cs.u64Base  = u64Base;
        pVCpu->iem.s.enmCpuMode = iemCalcCpuMode(pVCpu);
    }
    else
    {
        Assert(enmBranch == IEMBRANCH_CALL);
        /* Calls are much more complicated. */

        if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_CONF) && (DescCS.Legacy.Gen.u2Dpl < pVCpu->iem.s.uCpl))
        {
            uint16_t    offNewStack;    /* Offset of new stack in TSS. */
            uint16_t    cbNewStack;     /* Number of bytes the stack information takes up in TSS. */
            uint8_t     uNewCSDpl;
            uint8_t     cbWords;
            RTSEL       uNewSS;
            RTSEL       uOldSS;
            uint64_t    uOldRsp;
            IEMSELDESC  DescSS;
            RTPTRUNION  uPtrTSS;
            RTGCPTR     GCPtrTSS;
            RTPTRUNION  uPtrParmWds;
            RTGCPTR     GCPtrParmWds;

            /* More privilege. This is the fun part. */
            Assert(!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_CONF));    /* Filtered out above. */

            /*
             * Determine new SS:rSP from the TSS.
             */
            Assert(!pVCpu->cpum.GstCtx.tr.Attr.n.u1DescType);

            /* Figure out where the new stack pointer is stored in the TSS. */
            uNewCSDpl = DescCS.Legacy.Gen.u2Dpl;
            if (!IEM_IS_LONG_MODE(pVCpu))
            {
                if (pVCpu->cpum.GstCtx.tr.Attr.n.u4Type == X86_SEL_TYPE_SYS_386_TSS_BUSY)
                {
                    offNewStack = RT_UOFFSETOF(X86TSS32, esp0) + uNewCSDpl * 8;
                    cbNewStack  = RT_SIZEOFMEMB(X86TSS32, esp0) + RT_SIZEOFMEMB(X86TSS32, ss0);
                }
                else
                {
                    Assert(pVCpu->cpum.GstCtx.tr.Attr.n.u4Type == X86_SEL_TYPE_SYS_286_TSS_BUSY);
                    offNewStack = RT_UOFFSETOF(X86TSS16, sp0) + uNewCSDpl * 4;
                    cbNewStack  = RT_SIZEOFMEMB(X86TSS16, sp0) + RT_SIZEOFMEMB(X86TSS16, ss0);
                }
            }
            else
            {
                Assert(pVCpu->cpum.GstCtx.tr.Attr.n.u4Type == AMD64_SEL_TYPE_SYS_TSS_BUSY);
                offNewStack = RT_UOFFSETOF(X86TSS64, rsp0) + uNewCSDpl * RT_SIZEOFMEMB(X86TSS64, rsp0);
                cbNewStack  = RT_SIZEOFMEMB(X86TSS64, rsp0);
            }

            /* Check against TSS limit. */
            if ((uint16_t)(offNewStack + cbNewStack - 1) > pVCpu->cpum.GstCtx.tr.u32Limit)
            {
                Log(("BranchCallGate inner stack past TSS limit - %u > %u -> #TS(TSS)\n", offNewStack + cbNewStack - 1, pVCpu->cpum.GstCtx.tr.u32Limit));
                return iemRaiseTaskSwitchFaultBySelector(pVCpu, pVCpu->cpum.GstCtx.tr.Sel);
            }

            GCPtrTSS = pVCpu->cpum.GstCtx.tr.u64Base + offNewStack;
            rcStrict = iemMemMap(pVCpu, &uPtrTSS.pv, cbNewStack, UINT8_MAX, GCPtrTSS, IEM_ACCESS_SYS_R, 0);
            if (rcStrict != VINF_SUCCESS)
            {
                Log(("BranchCallGate: TSS mapping failed (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
                return rcStrict;
            }

            if (!IEM_IS_LONG_MODE(pVCpu))
            {
                if (pVCpu->cpum.GstCtx.tr.Attr.n.u4Type == X86_SEL_TYPE_SYS_386_TSS_BUSY)
                {
                    uNewRsp = uPtrTSS.pu32[0];
                    uNewSS  = uPtrTSS.pu16[2];
                }
                else
                {
                    Assert(pVCpu->cpum.GstCtx.tr.Attr.n.u4Type == X86_SEL_TYPE_SYS_286_TSS_BUSY);
                    uNewRsp = uPtrTSS.pu16[0];
                    uNewSS  = uPtrTSS.pu16[1];
                }
            }
            else
            {
                Assert(pVCpu->cpum.GstCtx.tr.Attr.n.u4Type == AMD64_SEL_TYPE_SYS_TSS_BUSY);
                /* SS will be a NULL selector, but that's valid. */
                uNewRsp = uPtrTSS.pu64[0];
                uNewSS  = uNewCSDpl;
            }

            /* Done with the TSS now. */
            rcStrict = iemMemCommitAndUnmap(pVCpu, uPtrTSS.pv, IEM_ACCESS_SYS_R);
            if (rcStrict != VINF_SUCCESS)
            {
                Log(("BranchCallGate: TSS unmapping failed (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
                return rcStrict;
            }

            /* Only used outside of long mode. */
            cbWords = pDesc->Legacy.Gate.u5ParmCount;

            /* If EFER.LMA is 0, there's extra work to do. */
            if (!IEM_IS_LONG_MODE(pVCpu))
            {
                if ((uNewSS & X86_SEL_MASK_OFF_RPL) == 0)
                {
                    Log(("BranchCallGate new SS NULL -> #TS(NewSS)\n"));
                    return iemRaiseTaskSwitchFaultBySelector(pVCpu, uNewSS);
                }

                /* Grab the new SS descriptor. */
                rcStrict = iemMemFetchSelDesc(pVCpu, &DescSS, uNewSS, X86_XCPT_SS);
                if (rcStrict != VINF_SUCCESS)
                    return rcStrict;

                /* Ensure that CS.DPL == SS.RPL == SS.DPL. */
                if (   (DescCS.Legacy.Gen.u2Dpl != (uNewSS & X86_SEL_RPL))
                    || (DescCS.Legacy.Gen.u2Dpl != DescSS.Legacy.Gen.u2Dpl))
                {
                    Log(("BranchCallGate call bad RPL/DPL uNewSS=%04x SS DPL=%d CS DPL=%u -> #TS(NewSS)\n",
                         uNewSS, DescCS.Legacy.Gen.u2Dpl, DescCS.Legacy.Gen.u2Dpl));
                    return iemRaiseTaskSwitchFaultBySelector(pVCpu, uNewSS);
                }

                /* Ensure new SS is a writable data segment. */
                if ((DescSS.Legacy.Gen.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_WRITE)) != X86_SEL_TYPE_WRITE)
                {
                    Log(("BranchCallGate call new SS -> not a writable data selector (u4Type=%#x)\n", DescSS.Legacy.Gen.u4Type));
                    return iemRaiseTaskSwitchFaultBySelector(pVCpu, uNewSS);
                }

                if (!DescSS.Legacy.Gen.u1Present)
                {
                    Log(("BranchCallGate New stack not present uSel=%04x -> #SS(NewSS)\n", uNewSS));
                    return iemRaiseStackSelectorNotPresentBySelector(pVCpu, uNewSS);
                }
                if (pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_386_CALL_GATE)
                    cbNewStack = (uint16_t)sizeof(uint32_t) * (4 + cbWords);
                else
                    cbNewStack = (uint16_t)sizeof(uint16_t) * (4 + cbWords);
            }
            else
            {
                /* Just grab the new (NULL) SS descriptor. */
                /** @todo testcase: Check whether the zero GDT entry is actually loaded here
                 *        like we do... */
                rcStrict = iemMemFetchSelDesc(pVCpu, &DescSS, uNewSS, X86_XCPT_SS);
                if (rcStrict != VINF_SUCCESS)
                    return rcStrict;

                cbNewStack = sizeof(uint64_t) * 4;
            }

            /** @todo According to Intel, new stack is checked for enough space first,
             *        then switched. According to AMD, the stack is switched first and
             *        then pushes might fault!
             *        NB: OS/2 Warp 3/4 actively relies on the fact that possible
             *        incoming stack \#PF happens before actual stack switch. AMD is
             *        either lying or implicitly assumes that new state is committed
             *        only if and when an instruction doesn't fault.
             */

            /** @todo According to AMD, CS is loaded first, then SS.
             *        According to Intel, it's the other way around!?
             */

            /** @todo Intel and AMD disagree on when exactly the CPL changes! */

            /* Set the accessed bit before committing new SS. */
            if (!(DescSS.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
            {
                rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewSS);
                if (rcStrict != VINF_SUCCESS)
                    return rcStrict;
                DescSS.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
            }

            /* Remember the old SS:rSP and their linear address. */
            uOldSS  = pVCpu->cpum.GstCtx.ss.Sel;
            uOldRsp = pVCpu->cpum.GstCtx.ss.Attr.n.u1DefBig ? pVCpu->cpum.GstCtx.rsp : pVCpu->cpum.GstCtx.sp;

            GCPtrParmWds = pVCpu->cpum.GstCtx.ss.u64Base + uOldRsp;

            /* HACK ALERT! Probe if the write to the new stack will succeed. May #SS(NewSS)
                           or #PF, the former is not implemented in this workaround. */
            /** @todo Proper fix callgate target stack exceptions. */
            /** @todo testcase: Cover callgates with partially or fully inaccessible
             *        target stacks. */
            void    *pvNewFrame;
            RTGCPTR  GCPtrNewStack = X86DESC_BASE(&DescSS.Legacy) + uNewRsp - cbNewStack;
            rcStrict = iemMemMap(pVCpu, &pvNewFrame, cbNewStack, UINT8_MAX, GCPtrNewStack, IEM_ACCESS_SYS_RW, 0);
            if (rcStrict != VINF_SUCCESS)
            {
                Log(("BranchCallGate: Incoming stack (%04x:%08RX64) not accessible, rc=%Rrc\n", uNewSS, uNewRsp, VBOXSTRICTRC_VAL(rcStrict)));
                return rcStrict;
            }
            rcStrict = iemMemCommitAndUnmap(pVCpu, pvNewFrame, IEM_ACCESS_SYS_RW);
            if (rcStrict != VINF_SUCCESS)
            {
                Log(("BranchCallGate: New stack probe unmapping failed (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
                return rcStrict;
            }

            /* Commit new SS:rSP. */
            pVCpu->cpum.GstCtx.ss.Sel      = uNewSS;
            pVCpu->cpum.GstCtx.ss.ValidSel = uNewSS;
            pVCpu->cpum.GstCtx.ss.Attr.u   = X86DESC_GET_HID_ATTR(&DescSS.Legacy);
            pVCpu->cpum.GstCtx.ss.u32Limit = X86DESC_LIMIT_G(&DescSS.Legacy);
            pVCpu->cpum.GstCtx.ss.u64Base  = X86DESC_BASE(&DescSS.Legacy);
            pVCpu->cpum.GstCtx.ss.fFlags   = CPUMSELREG_FLAGS_VALID;
            pVCpu->cpum.GstCtx.rsp         = uNewRsp;
            pVCpu->iem.s.uCpl = uNewCSDpl; /** @todo is the parameter words accessed using the new CPL or the old CPL? */
            Assert(CPUMSELREG_ARE_HIDDEN_PARTS_VALID(pVCpu, &pVCpu->cpum.GstCtx.ss));
            CPUMSetChangedFlags(pVCpu, CPUM_CHANGED_HIDDEN_SEL_REGS);

            /* At this point the stack access must not fail because new state was already committed. */
            /** @todo this can still fail due to SS.LIMIT not check.   */
            rcStrict = iemMemStackPushBeginSpecial(pVCpu, cbNewStack,
                                                   IEM_IS_LONG_MODE(pVCpu) ? 7
                                                   : pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_386_CALL_GATE ? 3 : 1,
                                                   &uPtrRet.pv, &uNewRsp);
            AssertMsgReturn(rcStrict == VINF_SUCCESS, ("BranchCallGate: New stack mapping failed (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)),
                            VERR_INTERNAL_ERROR_5);

            if (!IEM_IS_LONG_MODE(pVCpu))
            {
                if (pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_386_CALL_GATE)
                {
                    if (cbWords)
                    {
                        /* Map the relevant chunk of the old stack. */
                        rcStrict = iemMemMap(pVCpu, &uPtrParmWds.pv, cbWords * 4, UINT8_MAX, GCPtrParmWds,
                                             IEM_ACCESS_DATA_R, 0 /** @todo Can uNewCSDpl == 3? Then we need alignment mask here! */);
                        if (rcStrict != VINF_SUCCESS)
                        {
                            Log(("BranchCallGate: Old stack mapping (32-bit) failed (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
                            return rcStrict;
                        }

                        /* Copy the parameter (d)words. */
                        for (int i = 0; i < cbWords; ++i)
                            uPtrRet.pu32[2 + i] = uPtrParmWds.pu32[i];

                        /* Unmap the old stack. */
                        rcStrict = iemMemCommitAndUnmap(pVCpu, uPtrParmWds.pv, IEM_ACCESS_DATA_R);
                        if (rcStrict != VINF_SUCCESS)
                        {
                            Log(("BranchCallGate: Old stack unmapping (32-bit) failed (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
                            return rcStrict;
                        }
                    }

                    /* Push the old CS:rIP. */
                    uPtrRet.pu32[0] = pVCpu->cpum.GstCtx.eip + cbInstr;
                    uPtrRet.pu32[1] = pVCpu->cpum.GstCtx.cs.Sel; /** @todo Testcase: What is written to the high word when pushing CS? */

                    /* Push the old SS:rSP. */
                    uPtrRet.pu32[2 + cbWords + 0] = uOldRsp;
                    uPtrRet.pu32[2 + cbWords + 1] = uOldSS;
                }
                else
                {
                    Assert(pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_286_CALL_GATE);

                    if (cbWords)
                    {
                        /* Map the relevant chunk of the old stack. */
                        rcStrict = iemMemMap(pVCpu, &uPtrParmWds.pv, cbWords * 2, UINT8_MAX, GCPtrParmWds,
                                             IEM_ACCESS_DATA_R, 0 /** @todo Can uNewCSDpl == 3? Then we need alignment mask here! */);
                        if (rcStrict != VINF_SUCCESS)
                        {
                            Log(("BranchCallGate: Old stack mapping (16-bit) failed (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
                            return rcStrict;
                        }

                        /* Copy the parameter words. */
                        for (int i = 0; i < cbWords; ++i)
                            uPtrRet.pu16[2 + i] = uPtrParmWds.pu16[i];

                        /* Unmap the old stack. */
                        rcStrict = iemMemCommitAndUnmap(pVCpu, uPtrParmWds.pv, IEM_ACCESS_DATA_R);
                        if (rcStrict != VINF_SUCCESS)
                        {
                            Log(("BranchCallGate: Old stack unmapping (32-bit) failed (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
                            return rcStrict;
                        }
                    }

                    /* Push the old CS:rIP. */
                    uPtrRet.pu16[0] = pVCpu->cpum.GstCtx.ip + cbInstr;
                    uPtrRet.pu16[1] = pVCpu->cpum.GstCtx.cs.Sel;

                    /* Push the old SS:rSP. */
                    uPtrRet.pu16[2 + cbWords + 0] = uOldRsp;
                    uPtrRet.pu16[2 + cbWords + 1] = uOldSS;
                }
            }
            else
            {
                Assert(pDesc->Legacy.Gate.u4Type == AMD64_SEL_TYPE_SYS_CALL_GATE);

                /* For 64-bit gates, no parameters are copied. Just push old SS:rSP and CS:rIP. */
                uPtrRet.pu64[0] = pVCpu->cpum.GstCtx.rip + cbInstr;
                uPtrRet.pu64[1] = pVCpu->cpum.GstCtx.cs.Sel; /** @todo Testcase: What is written to the high words when pushing CS? */
                uPtrRet.pu64[2] = uOldRsp;
                uPtrRet.pu64[3] = uOldSS;       /** @todo Testcase: What is written to the high words when pushing SS? */
            }

            rcStrict = iemMemStackPushCommitSpecial(pVCpu, uPtrRet.pv, uNewRsp);
            if (rcStrict != VINF_SUCCESS)
            {
                Log(("BranchCallGate: New stack unmapping failed (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
                return rcStrict;
            }

            /* Chop the high bits off if 16-bit gate (Intel says so). */
            if (pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_286_CALL_GATE)
                uNewRip = (uint16_t)uNewRip;

            /* Limit / canonical check. */
            cbLimit = X86DESC_LIMIT_G(&DescCS.Legacy);
            if (!IEM_IS_LONG_MODE(pVCpu))
            {
                if (uNewRip > cbLimit)
                {
                    Log(("BranchCallGate %04x:%08RX64 -> out of bounds (%#x)\n", uNewCS, uNewRip, cbLimit));
                    return iemRaiseGeneralProtectionFaultBySelector(pVCpu, 0);
                }
                u64Base = X86DESC_BASE(&DescCS.Legacy);
            }
            else
            {
                Assert(pDesc->Legacy.Gate.u4Type == AMD64_SEL_TYPE_SYS_CALL_GATE);
                if (!IEM_IS_CANONICAL(uNewRip))
                {
                    Log(("BranchCallGate call %04x:%016RX64 - not canonical -> #GP\n", uNewCS, uNewRip));
                    return iemRaiseNotCanonical(pVCpu);
                }
                u64Base = 0;
            }

            /*
             * Now set the accessed bit before
             * writing the return address to the stack and committing the result into
             * CS, CSHID and RIP.
             */
            /** @todo Testcase: Need to check WHEN exactly the accessed bit is set. */
            if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
            {
                rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewCS);
                if (rcStrict != VINF_SUCCESS)
                    return rcStrict;
                /** @todo check what VT-x and AMD-V does. */
                DescCS.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
            }

            /* Commit new CS:rIP. */
            pVCpu->cpum.GstCtx.rip         = uNewRip;
            pVCpu->cpum.GstCtx.cs.Sel      = uNewCS & X86_SEL_MASK_OFF_RPL;
            pVCpu->cpum.GstCtx.cs.Sel     |= pVCpu->iem.s.uCpl;
            pVCpu->cpum.GstCtx.cs.ValidSel = pVCpu->cpum.GstCtx.cs.Sel;
            pVCpu->cpum.GstCtx.cs.fFlags   = CPUMSELREG_FLAGS_VALID;
            pVCpu->cpum.GstCtx.cs.Attr.u   = X86DESC_GET_HID_ATTR(&DescCS.Legacy);
            pVCpu->cpum.GstCtx.cs.u32Limit = cbLimit;
            pVCpu->cpum.GstCtx.cs.u64Base  = u64Base;
            pVCpu->iem.s.enmCpuMode = iemCalcCpuMode(pVCpu);
        }
        else
        {
            /* Same privilege. */
            /** @todo This is very similar to regular far calls; merge! */

            /* Check stack first - may #SS(0). */
            /** @todo check how gate size affects pushing of CS! Does callf 16:32 in
             *        16-bit code cause a two or four byte CS to be pushed? */
            rcStrict = iemMemStackPushBeginSpecial(pVCpu,
                                                   IEM_IS_LONG_MODE(pVCpu) ? 8+8
                                                   : pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_386_CALL_GATE ? 4+4 : 2+2,
                                                   IEM_IS_LONG_MODE(pVCpu) ? 7
                                                   : pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_386_CALL_GATE ? 3 : 2,
                                                   &uPtrRet.pv, &uNewRsp);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;

            /* Chop the high bits off if 16-bit gate (Intel says so). */
            if (pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_286_CALL_GATE)
                uNewRip = (uint16_t)uNewRip;

            /* Limit / canonical check. */
            cbLimit = X86DESC_LIMIT_G(&DescCS.Legacy);
            if (!IEM_IS_LONG_MODE(pVCpu))
            {
                if (uNewRip > cbLimit)
                {
                    Log(("BranchCallGate %04x:%08RX64 -> out of bounds (%#x)\n", uNewCS, uNewRip, cbLimit));
                    return iemRaiseGeneralProtectionFaultBySelector(pVCpu, 0);
                }
                u64Base = X86DESC_BASE(&DescCS.Legacy);
            }
            else
            {
                if (!IEM_IS_CANONICAL(uNewRip))
                {
                    Log(("BranchCallGate call %04x:%016RX64 - not canonical -> #GP\n", uNewCS, uNewRip));
                    return iemRaiseNotCanonical(pVCpu);
                }
                u64Base = 0;
            }

            /*
             * Now set the accessed bit before
             * writing the return address to the stack and committing the result into
             * CS, CSHID and RIP.
             */
            /** @todo Testcase: Need to check WHEN exactly the accessed bit is set. */
            if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
            {
                rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewCS);
                if (rcStrict != VINF_SUCCESS)
                    return rcStrict;
                /** @todo check what VT-x and AMD-V does. */
                DescCS.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
            }

            /* stack */
            if (!IEM_IS_LONG_MODE(pVCpu))
            {
                if (pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_386_CALL_GATE)
                {
                    uPtrRet.pu32[0] = pVCpu->cpum.GstCtx.eip + cbInstr;
                    uPtrRet.pu32[1] = pVCpu->cpum.GstCtx.cs.Sel; /** @todo Testcase: What is written to the high word when pushing CS? */
                }
                else
                {
                    Assert(pDesc->Legacy.Gate.u4Type == X86_SEL_TYPE_SYS_286_CALL_GATE);
                    uPtrRet.pu16[0] = pVCpu->cpum.GstCtx.ip + cbInstr;
                    uPtrRet.pu16[1] = pVCpu->cpum.GstCtx.cs.Sel;
                }
            }
            else
            {
                Assert(pDesc->Legacy.Gate.u4Type == AMD64_SEL_TYPE_SYS_CALL_GATE);
                uPtrRet.pu64[0] = pVCpu->cpum.GstCtx.rip + cbInstr;
                uPtrRet.pu64[1] = pVCpu->cpum.GstCtx.cs.Sel; /** @todo Testcase: What is written to the high words when pushing CS? */
            }

            rcStrict = iemMemStackPushCommitSpecial(pVCpu, uPtrRet.pv, uNewRsp);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;

            /* commit */
            pVCpu->cpum.GstCtx.rip         = uNewRip;
            pVCpu->cpum.GstCtx.cs.Sel      = uNewCS & X86_SEL_MASK_OFF_RPL;
            pVCpu->cpum.GstCtx.cs.Sel     |= pVCpu->iem.s.uCpl;
            pVCpu->cpum.GstCtx.cs.ValidSel = pVCpu->cpum.GstCtx.cs.Sel;
            pVCpu->cpum.GstCtx.cs.fFlags   = CPUMSELREG_FLAGS_VALID;
            pVCpu->cpum.GstCtx.cs.Attr.u   = X86DESC_GET_HID_ATTR(&DescCS.Legacy);
            pVCpu->cpum.GstCtx.cs.u32Limit = cbLimit;
            pVCpu->cpum.GstCtx.cs.u64Base  = u64Base;
            pVCpu->iem.s.enmCpuMode  = iemCalcCpuMode(pVCpu);
        }
    }
    pVCpu->cpum.GstCtx.eflags.Bits.u1RF = 0;
/** @todo single stepping   */

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr);
    return VINF_SUCCESS;
#endif /* IEM_IMPLEMENTS_CALLGATE */
}


/**
 * Implements far jumps and calls thru system selectors.
 *
 * @returns VBox strict status code.
 * @param   pVCpu           The cross context virtual CPU structure of the
 *                          calling thread.
 * @param   cbInstr         The current instruction length.
 * @param   uSel            The selector.
 * @param   enmBranch       The kind of branching we're performing.
 * @param   enmEffOpSize    The effective operand size.
 * @param   pDesc           The descriptor corresponding to @a uSel.
 */
static VBOXSTRICTRC iemCImpl_BranchSysSel(PVMCPUCC pVCpu, uint8_t cbInstr, uint16_t uSel, IEMBRANCH enmBranch,
                                          IEMMODE enmEffOpSize, PIEMSELDESC pDesc)
{
    Assert(enmBranch == IEMBRANCH_JUMP || enmBranch == IEMBRANCH_CALL);
    Assert((uSel & X86_SEL_MASK_OFF_RPL));
    IEM_CTX_IMPORT_RET(pVCpu, IEM_CPUMCTX_EXTRN_XCPT_MASK);

    if (IEM_IS_LONG_MODE(pVCpu))
        switch (pDesc->Legacy.Gen.u4Type)
        {
            case AMD64_SEL_TYPE_SYS_CALL_GATE:
                return iemCImpl_BranchCallGate(pVCpu, cbInstr, uSel, enmBranch, enmEffOpSize, pDesc);

            default:
            case AMD64_SEL_TYPE_SYS_LDT:
            case AMD64_SEL_TYPE_SYS_TSS_BUSY:
            case AMD64_SEL_TYPE_SYS_TSS_AVAIL:
            case AMD64_SEL_TYPE_SYS_TRAP_GATE:
            case AMD64_SEL_TYPE_SYS_INT_GATE:
                Log(("branch %04x -> wrong sys selector (64-bit): %d\n", uSel, pDesc->Legacy.Gen.u4Type));
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }

    switch (pDesc->Legacy.Gen.u4Type)
    {
        case X86_SEL_TYPE_SYS_286_CALL_GATE:
        case X86_SEL_TYPE_SYS_386_CALL_GATE:
            return iemCImpl_BranchCallGate(pVCpu, cbInstr, uSel, enmBranch, enmEffOpSize, pDesc);

        case X86_SEL_TYPE_SYS_TASK_GATE:
            return iemCImpl_BranchTaskGate(pVCpu, cbInstr, uSel, enmBranch, enmEffOpSize, pDesc);

        case X86_SEL_TYPE_SYS_286_TSS_AVAIL:
        case X86_SEL_TYPE_SYS_386_TSS_AVAIL:
            return iemCImpl_BranchTaskSegment(pVCpu, cbInstr, uSel, enmBranch, enmEffOpSize, pDesc);

        case X86_SEL_TYPE_SYS_286_TSS_BUSY:
            Log(("branch %04x -> busy 286 TSS\n", uSel));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);

        case X86_SEL_TYPE_SYS_386_TSS_BUSY:
            Log(("branch %04x -> busy 386 TSS\n", uSel));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);

        default:
        case X86_SEL_TYPE_SYS_LDT:
        case X86_SEL_TYPE_SYS_286_INT_GATE:
        case X86_SEL_TYPE_SYS_286_TRAP_GATE:
        case X86_SEL_TYPE_SYS_386_INT_GATE:
        case X86_SEL_TYPE_SYS_386_TRAP_GATE:
            Log(("branch %04x -> wrong sys selector: %d\n", uSel, pDesc->Legacy.Gen.u4Type));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
    }
}


/**
 * Implements far jumps.
 *
 * @param   uSel            The selector.
 * @param   offSeg          The segment offset.
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_3(iemCImpl_FarJmp, uint16_t, uSel, uint64_t, offSeg, IEMMODE, enmEffOpSize)
{
    NOREF(cbInstr);
    Assert(offSeg <= UINT32_MAX || (!IEM_IS_GUEST_CPU_AMD(pVCpu) && pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT));

    /*
     * Real mode and V8086 mode are easy.  The only snag seems to be that
     * CS.limit doesn't change and the limit check is done against the current
     * limit.
     */
    /** @todo Robert Collins claims (The Segment Descriptor Cache, DDJ August
     *        1998) that up to and including the Intel 486, far control
     *        transfers in real mode set default CS attributes (0x93) and also
     *        set a 64K segment limit. Starting with the Pentium, the
     *        attributes and limit are left alone but the access rights are
     *        ignored. We only implement the Pentium+ behavior.
     *  */
    if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
    {
        Assert(enmEffOpSize == IEMMODE_16BIT || enmEffOpSize == IEMMODE_32BIT);
        if (offSeg > pVCpu->cpum.GstCtx.cs.u32Limit)
        {
            Log(("iemCImpl_FarJmp: 16-bit limit\n"));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }

        if (enmEffOpSize == IEMMODE_16BIT) /** @todo WRONG, must pass this. */
            pVCpu->cpum.GstCtx.rip       = offSeg;
        else
            pVCpu->cpum.GstCtx.rip       = offSeg & UINT16_MAX;
        pVCpu->cpum.GstCtx.cs.Sel        = uSel;
        pVCpu->cpum.GstCtx.cs.ValidSel   = uSel;
        pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.cs.u64Base    = (uint32_t)uSel << 4;

        return iemRegFinishClearingRF(pVCpu);
    }

    /*
     * Protected mode. Need to parse the specified descriptor...
     */
    if (!(uSel & X86_SEL_MASK_OFF_RPL))
    {
        Log(("jmpf %04x:%08RX64 -> invalid selector, #GP(0)\n", uSel, offSeg));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /* Fetch the descriptor. */
    IEMSELDESC Desc;
    VBOXSTRICTRC rcStrict = iemMemFetchSelDesc(pVCpu, &Desc, uSel, X86_XCPT_GP);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Is it there? */
    if (!Desc.Legacy.Gen.u1Present) /** @todo this is probably checked too early. Testcase! */
    {
        Log(("jmpf %04x:%08RX64 -> segment not present\n", uSel, offSeg));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uSel);
    }

    /*
     * Deal with it according to its type.  We do the standard code selectors
     * here and dispatch the system selectors to worker functions.
     */
    if (!Desc.Legacy.Gen.u1DescType)
        return iemCImpl_BranchSysSel(pVCpu, cbInstr, uSel, IEMBRANCH_JUMP, enmEffOpSize, &Desc);

    /* Only code segments. */
    if (!(Desc.Legacy.Gen.u4Type & X86_SEL_TYPE_CODE))
    {
        Log(("jmpf %04x:%08RX64 -> not a code selector (u4Type=%#x).\n", uSel, offSeg, Desc.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
    }

    /* L vs D. */
    if (   Desc.Legacy.Gen.u1Long
        && Desc.Legacy.Gen.u1DefBig
        && IEM_IS_LONG_MODE(pVCpu))
    {
        Log(("jmpf %04x:%08RX64 -> both L and D are set.\n", uSel, offSeg));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
    }

    /* DPL/RPL/CPL check, where conforming segments makes a difference. */
    if (Desc.Legacy.Gen.u4Type & X86_SEL_TYPE_CONF)
    {
        if (pVCpu->iem.s.uCpl < Desc.Legacy.Gen.u2Dpl)
        {
            Log(("jmpf %04x:%08RX64 -> DPL violation (conforming); DPL=%d CPL=%u\n",
                 uSel, offSeg, Desc.Legacy.Gen.u2Dpl, pVCpu->iem.s.uCpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }
    }
    else
    {
        if (pVCpu->iem.s.uCpl != Desc.Legacy.Gen.u2Dpl)
        {
            Log(("jmpf %04x:%08RX64 -> CPL != DPL; DPL=%d CPL=%u\n", uSel, offSeg, Desc.Legacy.Gen.u2Dpl, pVCpu->iem.s.uCpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }
        if ((uSel & X86_SEL_RPL) > pVCpu->iem.s.uCpl)
        {
            Log(("jmpf %04x:%08RX64 -> RPL > DPL; RPL=%d CPL=%u\n", uSel, offSeg, (uSel & X86_SEL_RPL), pVCpu->iem.s.uCpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }
    }

    /* Chop the high bits if 16-bit (Intel says so). */
    if (enmEffOpSize == IEMMODE_16BIT)
        offSeg &= UINT16_MAX;

    /* Limit check and get the base.  */
    uint64_t u64Base;
    uint32_t cbLimit = X86DESC_LIMIT_G(&Desc.Legacy);
    if (   !Desc.Legacy.Gen.u1Long
        || !IEM_IS_LONG_MODE(pVCpu))
    {
        if (RT_LIKELY(offSeg <= cbLimit))
            u64Base = X86DESC_BASE(&Desc.Legacy);
        else
        {
            Log(("jmpf %04x:%08RX64 -> out of bounds (%#x)\n", uSel, offSeg, cbLimit));
            /** @todo Intel says this is \#GP(0)! */
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }
    }
    else
        u64Base = 0;

    /*
     * Ok, everything checked out fine.  Now set the accessed bit before
     * committing the result into CS, CSHID and RIP.
     */
    if (!(Desc.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
    {
        rcStrict = iemMemMarkSelDescAccessed(pVCpu, uSel);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        /** @todo check what VT-x and AMD-V does. */
        Desc.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
    }

    /* commit */
    pVCpu->cpum.GstCtx.rip = offSeg;
    pVCpu->cpum.GstCtx.cs.Sel         = uSel & X86_SEL_MASK_OFF_RPL;
    pVCpu->cpum.GstCtx.cs.Sel        |= pVCpu->iem.s.uCpl; /** @todo is this right for conforming segs? or in general? */
    pVCpu->cpum.GstCtx.cs.ValidSel    = pVCpu->cpum.GstCtx.cs.Sel;
    pVCpu->cpum.GstCtx.cs.fFlags      = CPUMSELREG_FLAGS_VALID;
    pVCpu->cpum.GstCtx.cs.Attr.u      = X86DESC_GET_HID_ATTR(&Desc.Legacy);
    pVCpu->cpum.GstCtx.cs.u32Limit    = cbLimit;
    pVCpu->cpum.GstCtx.cs.u64Base     = u64Base;
    pVCpu->iem.s.enmCpuMode  = iemCalcCpuMode(pVCpu);
    /** @todo check if the hidden bits are loaded correctly for 64-bit
     *        mode.  */

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr);

    return iemRegFinishClearingRF(pVCpu);
}


/**
 * Implements far calls.
 *
 * This very similar to iemCImpl_FarJmp.
 *
 * @param   uSel            The selector.
 * @param   offSeg          The segment offset.
 * @param   enmEffOpSize    The operand size (in case we need it).
 */
IEM_CIMPL_DEF_3(iemCImpl_callf, uint16_t, uSel, uint64_t, offSeg, IEMMODE, enmEffOpSize)
{
    VBOXSTRICTRC    rcStrict;
    uint64_t        uNewRsp;
    RTPTRUNION      uPtrRet;

    /*
     * Real mode and V8086 mode are easy.  The only snag seems to be that
     * CS.limit doesn't change and the limit check is done against the current
     * limit.
     */
    /** @todo See comment for similar code in iemCImpl_FarJmp */
    if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
    {
        Assert(enmEffOpSize == IEMMODE_16BIT || enmEffOpSize == IEMMODE_32BIT);

        /* Check stack first - may #SS(0). */
        rcStrict = iemMemStackPushBeginSpecial(pVCpu, enmEffOpSize == IEMMODE_32BIT ? 4+4 : 2+2,
                                               enmEffOpSize == IEMMODE_32BIT ? 3 : 1,
                                               &uPtrRet.pv, &uNewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;

        /* Check the target address range. */
/** @todo this must be wrong! Write unreal mode tests! */
        if (offSeg > UINT32_MAX)
            return iemRaiseGeneralProtectionFault0(pVCpu);

        /* Everything is fine, push the return address. */
        if (enmEffOpSize == IEMMODE_16BIT)
        {
            uPtrRet.pu16[0] = pVCpu->cpum.GstCtx.ip + cbInstr;
            uPtrRet.pu16[1] = pVCpu->cpum.GstCtx.cs.Sel;
        }
        else
        {
            uPtrRet.pu32[0] = pVCpu->cpum.GstCtx.eip + cbInstr;
            uPtrRet.pu16[2] = pVCpu->cpum.GstCtx.cs.Sel;
        }
        rcStrict = iemMemStackPushCommitSpecial(pVCpu, uPtrRet.pv, uNewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;

        /* Branch. */
        pVCpu->cpum.GstCtx.rip           = offSeg;
        pVCpu->cpum.GstCtx.cs.Sel        = uSel;
        pVCpu->cpum.GstCtx.cs.ValidSel   = uSel;
        pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.cs.u64Base    = (uint32_t)uSel << 4;

        return iemRegFinishClearingRF(pVCpu);
    }

    /*
     * Protected mode. Need to parse the specified descriptor...
     */
    if (!(uSel & X86_SEL_MASK_OFF_RPL))
    {
        Log(("callf %04x:%08RX64 -> invalid selector, #GP(0)\n", uSel, offSeg));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /* Fetch the descriptor. */
    IEMSELDESC Desc;
    rcStrict = iemMemFetchSelDesc(pVCpu, &Desc, uSel, X86_XCPT_GP);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /*
     * Deal with it according to its type.  We do the standard code selectors
     * here and dispatch the system selectors to worker functions.
     */
    if (!Desc.Legacy.Gen.u1DescType)
        return iemCImpl_BranchSysSel(pVCpu, cbInstr, uSel, IEMBRANCH_CALL, enmEffOpSize, &Desc);

    /* Only code segments. */
    if (!(Desc.Legacy.Gen.u4Type & X86_SEL_TYPE_CODE))
    {
        Log(("callf %04x:%08RX64 -> not a code selector (u4Type=%#x).\n", uSel, offSeg, Desc.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
    }

    /* L vs D. */
    if (   Desc.Legacy.Gen.u1Long
        && Desc.Legacy.Gen.u1DefBig
        && IEM_IS_LONG_MODE(pVCpu))
    {
        Log(("callf %04x:%08RX64 -> both L and D are set.\n", uSel, offSeg));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
    }

    /* DPL/RPL/CPL check, where conforming segments makes a difference. */
    if (Desc.Legacy.Gen.u4Type & X86_SEL_TYPE_CONF)
    {
        if (pVCpu->iem.s.uCpl < Desc.Legacy.Gen.u2Dpl)
        {
            Log(("callf %04x:%08RX64 -> DPL violation (conforming); DPL=%d CPL=%u\n",
                 uSel, offSeg, Desc.Legacy.Gen.u2Dpl, pVCpu->iem.s.uCpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }
    }
    else
    {
        if (pVCpu->iem.s.uCpl != Desc.Legacy.Gen.u2Dpl)
        {
            Log(("callf %04x:%08RX64 -> CPL != DPL; DPL=%d CPL=%u\n", uSel, offSeg, Desc.Legacy.Gen.u2Dpl, pVCpu->iem.s.uCpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }
        if ((uSel & X86_SEL_RPL) > pVCpu->iem.s.uCpl)
        {
            Log(("callf %04x:%08RX64 -> RPL > DPL; RPL=%d CPL=%u\n", uSel, offSeg, (uSel & X86_SEL_RPL), pVCpu->iem.s.uCpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }
    }

    /* Is it there? */
    if (!Desc.Legacy.Gen.u1Present)
    {
        Log(("callf %04x:%08RX64 -> segment not present\n", uSel, offSeg));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uSel);
    }

    /* Check stack first - may #SS(0). */
    /** @todo check how operand prefix affects pushing of CS! Does callf 16:32 in
     *        16-bit code cause a two or four byte CS to be pushed? */
    rcStrict = iemMemStackPushBeginSpecial(pVCpu,
                                           enmEffOpSize == IEMMODE_64BIT ? 8+8 : enmEffOpSize == IEMMODE_32BIT ? 4+4 : 2+2,
                                           enmEffOpSize == IEMMODE_64BIT ? 7   : enmEffOpSize == IEMMODE_32BIT ? 3   : 1,
                                           &uPtrRet.pv, &uNewRsp);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Chop the high bits if 16-bit (Intel says so). */
    if (enmEffOpSize == IEMMODE_16BIT)
        offSeg &= UINT16_MAX;

    /* Limit / canonical check. */
    uint64_t u64Base;
    uint32_t cbLimit = X86DESC_LIMIT_G(&Desc.Legacy);
    if (   !Desc.Legacy.Gen.u1Long
        || !IEM_IS_LONG_MODE(pVCpu))
    {
        if (RT_LIKELY(offSeg <= cbLimit))
            u64Base = X86DESC_BASE(&Desc.Legacy);
        else
        {
            Log(("jmpf %04x:%08RX64 -> out of bounds (%#x)\n", uSel, offSeg, cbLimit));
            /** @todo Intel says this is \#GP(0)! */
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }
    }
    else if (IEM_IS_CANONICAL(offSeg))
        u64Base = 0;
    else
    {
        Log(("callf %04x:%016RX64 - not canonical -> #GP\n", uSel, offSeg));
        return iemRaiseNotCanonical(pVCpu);
    }

    /*
     * Now set the accessed bit before
     * writing the return address to the stack and committing the result into
     * CS, CSHID and RIP.
     */
    /** @todo Testcase: Need to check WHEN exactly the accessed bit is set. */
    if (!(Desc.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
    {
        rcStrict = iemMemMarkSelDescAccessed(pVCpu, uSel);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        /** @todo check what VT-x and AMD-V does. */
        Desc.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
    }

    /* stack */
    if (enmEffOpSize == IEMMODE_16BIT)
    {
        uPtrRet.pu16[0] = pVCpu->cpum.GstCtx.ip + cbInstr;
        uPtrRet.pu16[1] = pVCpu->cpum.GstCtx.cs.Sel;
    }
    else if (enmEffOpSize == IEMMODE_32BIT)
    {
        uPtrRet.pu32[0] = pVCpu->cpum.GstCtx.eip + cbInstr;
        uPtrRet.pu32[1] = pVCpu->cpum.GstCtx.cs.Sel; /** @todo Testcase: What is written to the high word when callf is pushing CS? */
    }
    else
    {
        uPtrRet.pu64[0] = pVCpu->cpum.GstCtx.rip + cbInstr;
        uPtrRet.pu64[1] = pVCpu->cpum.GstCtx.cs.Sel; /** @todo Testcase: What is written to the high words when callf is pushing CS? */
    }
    rcStrict = iemMemStackPushCommitSpecial(pVCpu, uPtrRet.pv, uNewRsp);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* commit */
    pVCpu->cpum.GstCtx.rip = offSeg;
    pVCpu->cpum.GstCtx.cs.Sel         = uSel & X86_SEL_MASK_OFF_RPL;
    pVCpu->cpum.GstCtx.cs.Sel        |= pVCpu->iem.s.uCpl;
    pVCpu->cpum.GstCtx.cs.ValidSel    = pVCpu->cpum.GstCtx.cs.Sel;
    pVCpu->cpum.GstCtx.cs.fFlags      = CPUMSELREG_FLAGS_VALID;
    pVCpu->cpum.GstCtx.cs.Attr.u      = X86DESC_GET_HID_ATTR(&Desc.Legacy);
    pVCpu->cpum.GstCtx.cs.u32Limit    = cbLimit;
    pVCpu->cpum.GstCtx.cs.u64Base     = u64Base;
    pVCpu->iem.s.enmCpuMode  = iemCalcCpuMode(pVCpu);
    /** @todo check if the hidden bits are loaded correctly for 64-bit
     *        mode.  */

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr);

    return iemRegFinishClearingRF(pVCpu);
}


/**
 * Implements retf.
 *
 * @param   enmEffOpSize    The effective operand size.
 * @param   cbPop           The amount of arguments to pop from the stack
 *                          (bytes).
 */
IEM_CIMPL_DEF_2(iemCImpl_retf, IEMMODE, enmEffOpSize, uint16_t, cbPop)
{
    VBOXSTRICTRC    rcStrict;
    RTCPTRUNION     uPtrFrame;
    RTUINT64U       NewRsp;
    uint64_t        uNewRip;
    uint16_t        uNewCs;
    NOREF(cbInstr);

    /*
     * Read the stack values first.
     */
    uint32_t        cbRetPtr = enmEffOpSize == IEMMODE_16BIT ? 2+2
                             : enmEffOpSize == IEMMODE_32BIT ? 4+4 : 8+8;
    rcStrict = iemMemStackPopBeginSpecial(pVCpu, cbRetPtr,
                                          enmEffOpSize == IEMMODE_16BIT ? 1 : enmEffOpSize == IEMMODE_32BIT ? 3 : 7,
                                          &uPtrFrame.pv, &NewRsp.u);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;
    if (enmEffOpSize == IEMMODE_16BIT)
    {
        uNewRip = uPtrFrame.pu16[0];
        uNewCs  = uPtrFrame.pu16[1];
    }
    else if (enmEffOpSize == IEMMODE_32BIT)
    {
        uNewRip = uPtrFrame.pu32[0];
        uNewCs  = uPtrFrame.pu16[2];
    }
    else
    {
        uNewRip = uPtrFrame.pu64[0];
        uNewCs  = uPtrFrame.pu16[4];
    }
    rcStrict = iemMemStackPopDoneSpecial(pVCpu, uPtrFrame.pv);
    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
    { /* extremely likely */ }
    else
        return rcStrict;

    /*
     * Real mode and V8086 mode are easy.
     */
    /** @todo See comment for similar code in iemCImpl_FarJmp */
    if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
    {
        Assert(enmEffOpSize == IEMMODE_32BIT || enmEffOpSize == IEMMODE_16BIT);
        /** @todo check how this is supposed to work if sp=0xfffe. */

        /* Check the limit of the new EIP. */
        /** @todo Intel pseudo code only does the limit check for 16-bit
         *        operands, AMD does not make any distinction. What is right? */
        if (uNewRip > pVCpu->cpum.GstCtx.cs.u32Limit)
            return iemRaiseSelectorBounds(pVCpu, X86_SREG_CS, IEM_ACCESS_INSTRUCTION);

        /* commit the operation. */
        if (cbPop)
            iemRegAddToRspEx(pVCpu, &NewRsp, cbPop);
        pVCpu->cpum.GstCtx.rsp           = NewRsp.u;
        pVCpu->cpum.GstCtx.rip           = uNewRip;
        pVCpu->cpum.GstCtx.cs.Sel        = uNewCs;
        pVCpu->cpum.GstCtx.cs.ValidSel   = uNewCs;
        pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.cs.u64Base    = (uint32_t)uNewCs << 4;
        return iemRegFinishClearingRF(pVCpu);
    }

    /*
     * Protected mode is complicated, of course.
     */
    if (!(uNewCs & X86_SEL_MASK_OFF_RPL))
    {
        Log(("retf %04x:%08RX64 -> invalid selector, #GP(0)\n", uNewCs, uNewRip));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_SREG_MASK | CPUMCTX_EXTRN_GDTR | CPUMCTX_EXTRN_LDTR);

    /* Fetch the descriptor. */
    IEMSELDESC DescCs;
    rcStrict = iemMemFetchSelDesc(pVCpu, &DescCs, uNewCs, X86_XCPT_GP);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Can only return to a code selector. */
    if (   !DescCs.Legacy.Gen.u1DescType
        || !(DescCs.Legacy.Gen.u4Type & X86_SEL_TYPE_CODE) )
    {
        Log(("retf %04x:%08RX64 -> not a code selector (u1DescType=%u u4Type=%#x).\n",
             uNewCs, uNewRip, DescCs.Legacy.Gen.u1DescType, DescCs.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }

    /* L vs D. */
    if (   DescCs.Legacy.Gen.u1Long /** @todo Testcase: far return to a selector with both L and D set. */
        && DescCs.Legacy.Gen.u1DefBig
        && IEM_IS_LONG_MODE(pVCpu))
    {
        Log(("retf %04x:%08RX64 -> both L & D set.\n", uNewCs, uNewRip));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }

    /* DPL/RPL/CPL checks. */
    if ((uNewCs & X86_SEL_RPL) < pVCpu->iem.s.uCpl)
    {
        Log(("retf %04x:%08RX64 -> RPL < CPL(%d).\n", uNewCs, uNewRip, pVCpu->iem.s.uCpl));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }

    if (DescCs.Legacy.Gen.u4Type & X86_SEL_TYPE_CONF)
    {
        if ((uNewCs & X86_SEL_RPL) < DescCs.Legacy.Gen.u2Dpl)
        {
            Log(("retf %04x:%08RX64 -> DPL violation (conforming); DPL=%u RPL=%u\n",
                 uNewCs, uNewRip, DescCs.Legacy.Gen.u2Dpl, (uNewCs & X86_SEL_RPL)));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
        }
    }
    else
    {
        if ((uNewCs & X86_SEL_RPL) != DescCs.Legacy.Gen.u2Dpl)
        {
            Log(("retf %04x:%08RX64 -> RPL != DPL; DPL=%u RPL=%u\n",
                 uNewCs, uNewRip, DescCs.Legacy.Gen.u2Dpl, (uNewCs & X86_SEL_RPL)));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
        }
    }

    /* Is it there? */
    if (!DescCs.Legacy.Gen.u1Present)
    {
        Log(("retf %04x:%08RX64 -> segment not present\n", uNewCs, uNewRip));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uNewCs);
    }

    /*
     * Return to outer privilege? (We'll typically have entered via a call gate.)
     */
    if ((uNewCs & X86_SEL_RPL) != pVCpu->iem.s.uCpl)
    {
        /* Read the outer stack pointer stored *after* the parameters. */
        rcStrict = iemMemStackPopContinueSpecial(pVCpu, cbPop /*off*/, cbRetPtr, &uPtrFrame.pv, NewRsp.u);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;

        uint16_t uNewOuterSs;
        RTUINT64U NewOuterRsp;
        if (enmEffOpSize == IEMMODE_16BIT)
        {
            NewOuterRsp.u = uPtrFrame.pu16[0];
            uNewOuterSs   = uPtrFrame.pu16[1];
        }
        else if (enmEffOpSize == IEMMODE_32BIT)
        {
            NewOuterRsp.u = uPtrFrame.pu32[0];
            uNewOuterSs   = uPtrFrame.pu16[2];
        }
        else
        {
            NewOuterRsp.u = uPtrFrame.pu64[0];
            uNewOuterSs   = uPtrFrame.pu16[4];
        }
        rcStrict = iemMemStackPopDoneSpecial(pVCpu, uPtrFrame.pv);
        if (RT_LIKELY(rcStrict == VINF_SUCCESS))
        { /* extremely likely */ }
        else
            return rcStrict;

        /* Check for NULL stack selector (invalid in ring-3 and non-long mode)
           and read the selector. */
        IEMSELDESC DescSs;
        if (!(uNewOuterSs & X86_SEL_MASK_OFF_RPL))
        {
            if (   !DescCs.Legacy.Gen.u1Long
                || (uNewOuterSs & X86_SEL_RPL) == 3)
            {
                Log(("retf %04x:%08RX64 %04x:%08RX64 -> invalid stack selector, #GP\n",
                     uNewCs, uNewRip, uNewOuterSs, NewOuterRsp.u));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }
            /** @todo Testcase: Return far to ring-1 or ring-2 with SS=0. */
            iemMemFakeStackSelDesc(&DescSs, (uNewOuterSs & X86_SEL_RPL));
        }
        else
        {
            /* Fetch the descriptor for the new stack segment. */
            rcStrict = iemMemFetchSelDesc(pVCpu, &DescSs, uNewOuterSs, X86_XCPT_GP);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
        }

        /* Check that RPL of stack and code selectors match. */
        if ((uNewCs & X86_SEL_RPL) != (uNewOuterSs & X86_SEL_RPL))
        {
            Log(("retf %04x:%08RX64 %04x:%08RX64 - SS.RPL != CS.RPL -> #GP(SS)\n", uNewCs, uNewRip, uNewOuterSs, NewOuterRsp.u));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewOuterSs);
        }

        /* Must be a writable data segment. */
        if (   !DescSs.Legacy.Gen.u1DescType
            || (DescSs.Legacy.Gen.u4Type & X86_SEL_TYPE_CODE)
            || !(DescSs.Legacy.Gen.u4Type & X86_SEL_TYPE_WRITE) )
        {
            Log(("retf %04x:%08RX64 %04x:%08RX64 - SS not a writable data segment (u1DescType=%u u4Type=%#x) -> #GP(SS).\n",
                 uNewCs, uNewRip, uNewOuterSs, NewOuterRsp.u, DescSs.Legacy.Gen.u1DescType, DescSs.Legacy.Gen.u4Type));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewOuterSs);
        }

        /* L vs D. (Not mentioned by intel.) */
        if (   DescSs.Legacy.Gen.u1Long /** @todo Testcase: far return to a stack selector with both L and D set. */
            && DescSs.Legacy.Gen.u1DefBig
            && IEM_IS_LONG_MODE(pVCpu))
        {
            Log(("retf %04x:%08RX64 %04x:%08RX64 - SS has both L & D set -> #GP(SS).\n",
                 uNewCs, uNewRip, uNewOuterSs, NewOuterRsp.u));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewOuterSs);
        }

        /* DPL/RPL/CPL checks. */
        if (DescSs.Legacy.Gen.u2Dpl != (uNewCs & X86_SEL_RPL))
        {
            Log(("retf %04x:%08RX64 %04x:%08RX64 - SS.DPL(%u) != CS.RPL (%u) -> #GP(SS).\n",
                 uNewCs, uNewRip, uNewOuterSs, NewOuterRsp.u, DescSs.Legacy.Gen.u2Dpl, uNewCs & X86_SEL_RPL));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewOuterSs);
        }

        /* Is it there? */
        if (!DescSs.Legacy.Gen.u1Present)
        {
            Log(("retf %04x:%08RX64 %04x:%08RX64 - SS not present -> #NP(SS).\n", uNewCs, uNewRip, uNewOuterSs, NewOuterRsp.u));
            return iemRaiseSelectorNotPresentBySelector(pVCpu, uNewCs);
        }

        /* Calc SS limit.*/
        uint32_t cbLimitSs = X86DESC_LIMIT_G(&DescSs.Legacy);

        /* Is RIP canonical or within CS.limit? */
        uint64_t u64Base;
        uint32_t cbLimitCs = X86DESC_LIMIT_G(&DescCs.Legacy);

        /** @todo Testcase: Is this correct? */
        if (   DescCs.Legacy.Gen.u1Long
            && IEM_IS_LONG_MODE(pVCpu) )
        {
            if (!IEM_IS_CANONICAL(uNewRip))
            {
                Log(("retf %04x:%08RX64 %04x:%08RX64 - not canonical -> #GP.\n", uNewCs, uNewRip, uNewOuterSs, NewOuterRsp.u));
                return iemRaiseNotCanonical(pVCpu);
            }
            u64Base = 0;
        }
        else
        {
            if (uNewRip > cbLimitCs)
            {
                Log(("retf %04x:%08RX64 %04x:%08RX64 - out of bounds (%#x)-> #GP(CS).\n",
                     uNewCs, uNewRip, uNewOuterSs, NewOuterRsp.u, cbLimitCs));
                /** @todo Intel says this is \#GP(0)! */
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
            }
            u64Base = X86DESC_BASE(&DescCs.Legacy);
        }

        /*
         * Now set the accessed bit before
         * writing the return address to the stack and committing the result into
         * CS, CSHID and RIP.
         */
        /** @todo Testcase: Need to check WHEN exactly the CS accessed bit is set. */
        if (!(DescCs.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
        {
            rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewCs);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
            /** @todo check what VT-x and AMD-V does. */
            DescCs.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
        }
        /** @todo Testcase: Need to check WHEN exactly the SS accessed bit is set. */
        if (!(DescSs.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
        {
            rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewOuterSs);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
            /** @todo check what VT-x and AMD-V does. */
            DescSs.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
        }

        /* commit */
        if (enmEffOpSize == IEMMODE_16BIT)
            pVCpu->cpum.GstCtx.rip           = uNewRip & UINT16_MAX; /** @todo Testcase: When exactly does this occur? With call it happens prior to the limit check according to Intel... */
        else
            pVCpu->cpum.GstCtx.rip           = uNewRip;
        pVCpu->cpum.GstCtx.cs.Sel            = uNewCs;
        pVCpu->cpum.GstCtx.cs.ValidSel       = uNewCs;
        pVCpu->cpum.GstCtx.cs.fFlags         = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.cs.Attr.u         = X86DESC_GET_HID_ATTR(&DescCs.Legacy);
        pVCpu->cpum.GstCtx.cs.u32Limit       = cbLimitCs;
        pVCpu->cpum.GstCtx.cs.u64Base        = u64Base;
        pVCpu->iem.s.enmCpuMode              = iemCalcCpuMode(pVCpu);
        pVCpu->cpum.GstCtx.ss.Sel            = uNewOuterSs;
        pVCpu->cpum.GstCtx.ss.ValidSel       = uNewOuterSs;
        pVCpu->cpum.GstCtx.ss.fFlags         = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.ss.Attr.u         = X86DESC_GET_HID_ATTR(&DescSs.Legacy);
        pVCpu->cpum.GstCtx.ss.u32Limit       = cbLimitSs;
        if (pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT)
            pVCpu->cpum.GstCtx.ss.u64Base    = 0;
        else
            pVCpu->cpum.GstCtx.ss.u64Base    = X86DESC_BASE(&DescSs.Legacy);
        if (cbPop)
            iemRegAddToRspEx(pVCpu, &NewOuterRsp, cbPop);
        if (pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT)
            pVCpu->cpum.GstCtx.rsp           = NewOuterRsp.u;
        else if (pVCpu->cpum.GstCtx.ss.Attr.n.u1DefBig)
            pVCpu->cpum.GstCtx.rsp           = (uint32_t)NewOuterRsp.u;
        else
            pVCpu->cpum.GstCtx.sp            = (uint16_t)NewOuterRsp.u;

        pVCpu->iem.s.uCpl                    = (uNewCs & X86_SEL_RPL);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCs & X86_SEL_RPL, &pVCpu->cpum.GstCtx.ds);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCs & X86_SEL_RPL, &pVCpu->cpum.GstCtx.es);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCs & X86_SEL_RPL, &pVCpu->cpum.GstCtx.fs);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCs & X86_SEL_RPL, &pVCpu->cpum.GstCtx.gs);

        /** @todo check if the hidden bits are loaded correctly for 64-bit
         *        mode. */
    }
    /*
     * Return to the same privilege level
     */
    else
    {
        /* Limit / canonical check. */
        uint64_t u64Base;
        uint32_t cbLimitCs = X86DESC_LIMIT_G(&DescCs.Legacy);

        /** @todo Testcase: Is this correct? */
        if (   DescCs.Legacy.Gen.u1Long
            && IEM_IS_LONG_MODE(pVCpu) )
        {
            if (!IEM_IS_CANONICAL(uNewRip))
            {
                Log(("retf %04x:%08RX64 - not canonical -> #GP\n", uNewCs, uNewRip));
                return iemRaiseNotCanonical(pVCpu);
            }
            u64Base = 0;
        }
        else
        {
            if (uNewRip > cbLimitCs)
            {
                Log(("retf %04x:%08RX64 -> out of bounds (%#x)\n", uNewCs, uNewRip, cbLimitCs));
                /** @todo Intel says this is \#GP(0)! */
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
            }
            u64Base = X86DESC_BASE(&DescCs.Legacy);
        }

        /*
         * Now set the accessed bit before
         * writing the return address to the stack and committing the result into
         * CS, CSHID and RIP.
         */
        /** @todo Testcase: Need to check WHEN exactly the accessed bit is set. */
        if (!(DescCs.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
        {
            rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewCs);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
            /** @todo check what VT-x and AMD-V does. */
            DescCs.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
        }

        /* commit */
        if (cbPop)
            iemRegAddToRspEx(pVCpu, &NewRsp, cbPop);
        if (!pVCpu->cpum.GstCtx.ss.Attr.n.u1DefBig)
            pVCpu->cpum.GstCtx.sp        = (uint16_t)NewRsp.u;
        else
            pVCpu->cpum.GstCtx.rsp       = NewRsp.u;
        if (enmEffOpSize == IEMMODE_16BIT)
            pVCpu->cpum.GstCtx.rip       = uNewRip & UINT16_MAX; /** @todo Testcase: When exactly does this occur? With call it happens prior to the limit check according to Intel... */
        else
            pVCpu->cpum.GstCtx.rip       = uNewRip;
        pVCpu->cpum.GstCtx.cs.Sel        = uNewCs;
        pVCpu->cpum.GstCtx.cs.ValidSel   = uNewCs;
        pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.cs.Attr.u     = X86DESC_GET_HID_ATTR(&DescCs.Legacy);
        pVCpu->cpum.GstCtx.cs.u32Limit   = cbLimitCs;
        pVCpu->cpum.GstCtx.cs.u64Base    = u64Base;
        /** @todo check if the hidden bits are loaded correctly for 64-bit
         *        mode.  */
        pVCpu->iem.s.enmCpuMode          = iemCalcCpuMode(pVCpu);
    }

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr); /** @todo use light flush for same privlege? */

    return iemRegFinishClearingRF(pVCpu);
}


/**
 * Implements retn and retn imm16.
 *
 * We're doing this in C because of the \#GP that might be raised if the popped
 * program counter is out of bounds.
 *
 * The hope with this forced inline worker function, is that the compiler will
 * be clever enough to eliminate unused code for the constant enmEffOpSize and
 * maybe cbPop parameters.
 *
 * @param   pVCpu           The cross context virtual CPU structure of the
 *                          calling thread.
 * @param   cbInstr         The current instruction length.
 * @param   enmEffOpSize    The effective operand size.  This is constant.
 * @param   cbPop           The amount of arguments to pop from the stack
 *                          (bytes).  This can be constant (zero).
 */
DECL_FORCE_INLINE(VBOXSTRICTRC) iemCImpl_ReturnNearCommon(PVMCPUCC pVCpu, uint8_t cbInstr, IEMMODE enmEffOpSize, uint16_t cbPop)
{
    /* Fetch the RSP from the stack. */
    VBOXSTRICTRC    rcStrict;
    RTUINT64U       NewRip;
    RTUINT64U       NewRsp;
    NewRsp.u = pVCpu->cpum.GstCtx.rsp;

    switch (enmEffOpSize)
    {
        case IEMMODE_16BIT:
            NewRip.u = 0;
            rcStrict = iemMemStackPopU16Ex(pVCpu, &NewRip.Words.w0, &NewRsp);
            break;
        case IEMMODE_32BIT:
            NewRip.u = 0;
            rcStrict = iemMemStackPopU32Ex(pVCpu, &NewRip.DWords.dw0, &NewRsp);
            break;
        case IEMMODE_64BIT:
            rcStrict = iemMemStackPopU64Ex(pVCpu, &NewRip.u, &NewRsp);
            break;
        IEM_NOT_REACHED_DEFAULT_CASE_RET();
    }
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Check the new RSP before loading it. */
    /** @todo Should test this as the intel+amd pseudo code doesn't mention half
     *        of it.  The canonical test is performed here and for call. */
    if (enmEffOpSize != IEMMODE_64BIT)
    {
        if (RT_LIKELY(NewRip.DWords.dw0 <= pVCpu->cpum.GstCtx.cs.u32Limit))
        { /* likely */ }
        else
        {
            Log(("retn newrip=%llx - out of bounds (%x) -> #GP\n", NewRip.u, pVCpu->cpum.GstCtx.cs.u32Limit));
            return iemRaiseSelectorBounds(pVCpu, X86_SREG_CS, IEM_ACCESS_INSTRUCTION);
        }
    }
    else
    {
        if (RT_LIKELY(IEM_IS_CANONICAL(NewRip.u)))
        { /* likely */ }
        else
        {
            Log(("retn newrip=%llx - not canonical -> #GP\n", NewRip.u));
            return iemRaiseNotCanonical(pVCpu);
        }
    }

    /* Apply cbPop */
    if (cbPop)
        iemRegAddToRspEx(pVCpu, &NewRsp, cbPop);

    /* Commit it. */
    pVCpu->cpum.GstCtx.rip = NewRip.u;
    pVCpu->cpum.GstCtx.rsp = NewRsp.u;

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr); /** @todo only need a light flush here, don't we?  We don't really need any flushing... */
    RT_NOREF(cbInstr);

    return iemRegFinishClearingRF(pVCpu);
}


/**
 * Implements retn imm16 with 16-bit effective operand size.
 *
 * @param   cbPop The amount of arguments to pop from the stack (bytes).
 */
IEM_CIMPL_DEF_1(iemCImpl_retn_iw_16, uint16_t, cbPop)
{
    return iemCImpl_ReturnNearCommon(pVCpu, cbInstr, IEMMODE_16BIT, cbPop);
}


/**
 * Implements retn imm16 with 32-bit effective operand size.
 *
 * @param   cbPop The amount of arguments to pop from the stack (bytes).
 */
IEM_CIMPL_DEF_1(iemCImpl_retn_iw_32, uint16_t, cbPop)
{
    return iemCImpl_ReturnNearCommon(pVCpu, cbInstr, IEMMODE_32BIT, cbPop);
}


/**
 * Implements retn imm16 with 64-bit effective operand size.
 *
 * @param   cbPop The amount of arguments to pop from the stack (bytes).
 */
IEM_CIMPL_DEF_1(iemCImpl_retn_iw_64, uint16_t, cbPop)
{
    return iemCImpl_ReturnNearCommon(pVCpu, cbInstr, IEMMODE_64BIT, cbPop);
}


/**
 * Implements retn with 16-bit effective operand size.
 */
IEM_CIMPL_DEF_0(iemCImpl_retn_16)
{
    return iemCImpl_ReturnNearCommon(pVCpu, cbInstr, IEMMODE_16BIT, 0);
}


/**
 * Implements retn with 32-bit effective operand size.
 */
IEM_CIMPL_DEF_0(iemCImpl_retn_32)
{
    return iemCImpl_ReturnNearCommon(pVCpu, cbInstr, IEMMODE_32BIT, 0);
}


/**
 * Implements retn with 64-bit effective operand size.
 */
IEM_CIMPL_DEF_0(iemCImpl_retn_64)
{
    return iemCImpl_ReturnNearCommon(pVCpu, cbInstr, IEMMODE_64BIT, 0);
}


/**
 * Implements enter.
 *
 * We're doing this in C because the instruction is insane, even for the
 * u8NestingLevel=0 case dealing with the stack is tedious.
 *
 * @param   enmEffOpSize    The effective operand size.
 * @param   cbFrame         Frame size.
 * @param   cParameters     Frame parameter count.
 */
IEM_CIMPL_DEF_3(iemCImpl_enter, IEMMODE, enmEffOpSize, uint16_t, cbFrame, uint8_t, cParameters)
{
    /* Push RBP, saving the old value in TmpRbp. */
    RTUINT64U       NewRsp; NewRsp.u = pVCpu->cpum.GstCtx.rsp;
    RTUINT64U       TmpRbp; TmpRbp.u = pVCpu->cpum.GstCtx.rbp;
    RTUINT64U       NewRbp;
    VBOXSTRICTRC    rcStrict;
    if (enmEffOpSize == IEMMODE_64BIT)
    {
        rcStrict = iemMemStackPushU64Ex(pVCpu, TmpRbp.u, &NewRsp);
        NewRbp = NewRsp;
    }
    else if (enmEffOpSize == IEMMODE_32BIT)
    {
        rcStrict = iemMemStackPushU32Ex(pVCpu, TmpRbp.DWords.dw0, &NewRsp);
        NewRbp = NewRsp;
    }
    else
    {
        rcStrict = iemMemStackPushU16Ex(pVCpu, TmpRbp.Words.w0, &NewRsp);
        NewRbp = TmpRbp;
        NewRbp.Words.w0 = NewRsp.Words.w0;
    }
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Copy the parameters (aka nesting levels by Intel). */
    cParameters &= 0x1f;
    if (cParameters > 0)
    {
        switch (enmEffOpSize)
        {
            case IEMMODE_16BIT:
                if (pVCpu->cpum.GstCtx.ss.Attr.n.u1DefBig)
                    TmpRbp.DWords.dw0 -= 2;
                else
                    TmpRbp.Words.w0   -= 2;
                do
                {
                    uint16_t u16Tmp;
                    rcStrict = iemMemStackPopU16Ex(pVCpu, &u16Tmp, &TmpRbp);
                    if (rcStrict != VINF_SUCCESS)
                        break;
                    rcStrict = iemMemStackPushU16Ex(pVCpu, u16Tmp, &NewRsp);
                } while (--cParameters > 0 && rcStrict == VINF_SUCCESS);
                break;

            case IEMMODE_32BIT:
                if (pVCpu->cpum.GstCtx.ss.Attr.n.u1DefBig)
                    TmpRbp.DWords.dw0 -= 4;
                else
                    TmpRbp.Words.w0   -= 4;
                do
                {
                    uint32_t u32Tmp;
                    rcStrict = iemMemStackPopU32Ex(pVCpu, &u32Tmp, &TmpRbp);
                    if (rcStrict != VINF_SUCCESS)
                        break;
                    rcStrict = iemMemStackPushU32Ex(pVCpu, u32Tmp, &NewRsp);
                } while (--cParameters > 0 && rcStrict == VINF_SUCCESS);
                break;

            case IEMMODE_64BIT:
                TmpRbp.u -= 8;
                do
                {
                    uint64_t u64Tmp;
                    rcStrict = iemMemStackPopU64Ex(pVCpu, &u64Tmp, &TmpRbp);
                    if (rcStrict != VINF_SUCCESS)
                        break;
                    rcStrict = iemMemStackPushU64Ex(pVCpu, u64Tmp, &NewRsp);
                } while (--cParameters > 0 && rcStrict == VINF_SUCCESS);
                break;

            IEM_NOT_REACHED_DEFAULT_CASE_RET();
        }
        if (rcStrict != VINF_SUCCESS)
            return VINF_SUCCESS;

        /* Push the new RBP */
        if (enmEffOpSize == IEMMODE_64BIT)
            rcStrict = iemMemStackPushU64Ex(pVCpu, NewRbp.u, &NewRsp);
        else if (enmEffOpSize == IEMMODE_32BIT)
            rcStrict = iemMemStackPushU32Ex(pVCpu, NewRbp.DWords.dw0, &NewRsp);
        else
            rcStrict = iemMemStackPushU16Ex(pVCpu, NewRbp.Words.w0, &NewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;

    }

    /* Recalc RSP. */
    iemRegSubFromRspEx(pVCpu, &NewRsp, cbFrame);

    /** @todo Should probe write access at the new RSP according to AMD. */
    /** @todo Should handle accesses to the VMX APIC-access page. */

    /* Commit it. */
    pVCpu->cpum.GstCtx.rbp = NewRbp.u;
    pVCpu->cpum.GstCtx.rsp = NewRsp.u;
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements leave.
 *
 * We're doing this in C because messing with the stack registers is annoying
 * since they depends on SS attributes.
 *
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_1(iemCImpl_leave, IEMMODE, enmEffOpSize)
{
    /* Calculate the intermediate RSP from RBP and the stack attributes. */
    RTUINT64U       NewRsp;
    if (pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT)
        NewRsp.u = pVCpu->cpum.GstCtx.rbp;
    else if (pVCpu->cpum.GstCtx.ss.Attr.n.u1DefBig)
        NewRsp.u = pVCpu->cpum.GstCtx.ebp;
    else
    {
        /** @todo Check that LEAVE actually preserve the high EBP bits. */
        NewRsp.u = pVCpu->cpum.GstCtx.rsp;
        NewRsp.Words.w0 = pVCpu->cpum.GstCtx.bp;
    }

    /* Pop RBP according to the operand size. */
    VBOXSTRICTRC    rcStrict;
    RTUINT64U       NewRbp;
    switch (enmEffOpSize)
    {
        case IEMMODE_16BIT:
            NewRbp.u = pVCpu->cpum.GstCtx.rbp;
            rcStrict = iemMemStackPopU16Ex(pVCpu, &NewRbp.Words.w0, &NewRsp);
            break;
        case IEMMODE_32BIT:
            NewRbp.u = 0;
            rcStrict = iemMemStackPopU32Ex(pVCpu, &NewRbp.DWords.dw0, &NewRsp);
            break;
        case IEMMODE_64BIT:
            rcStrict = iemMemStackPopU64Ex(pVCpu, &NewRbp.u, &NewRsp);
            break;
        IEM_NOT_REACHED_DEFAULT_CASE_RET();
    }
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;


    /* Commit it. */
    pVCpu->cpum.GstCtx.rbp = NewRbp.u;
    pVCpu->cpum.GstCtx.rsp = NewRsp.u;
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements int3 and int XX.
 *
 * @param   u8Int       The interrupt vector number.
 * @param   enmInt      The int instruction type.
 */
IEM_CIMPL_DEF_2(iemCImpl_int, uint8_t, u8Int, IEMINT, enmInt)
{
    Assert(pVCpu->iem.s.cXcptRecursions == 0);

    /*
     * We must check if this INT3 might belong to DBGF before raising a #BP.
     */
    if (u8Int == 3)
    {
        PVMCC pVM = pVCpu->CTX_SUFF(pVM);
        if (pVM->dbgf.ro.cEnabledInt3Breakpoints == 0)
        { /* likely: No vbox debugger breakpoints */ }
        else
        {
            VBOXSTRICTRC rcStrict = DBGFTrap03Handler(pVM, pVCpu, &pVCpu->cpum.GstCtx);
            Log(("iemCImpl_int: DBGFTrap03Handler -> %Rrc\n", VBOXSTRICTRC_VAL(rcStrict) ));
            if (rcStrict != VINF_EM_RAW_GUEST_TRAP)
                return iemSetPassUpStatus(pVCpu, rcStrict);
        }
    }
/** @todo single stepping   */
    return iemRaiseXcptOrInt(pVCpu,
                             cbInstr,
                             u8Int,
                             IEM_XCPT_FLAGS_T_SOFT_INT | enmInt,
                             0,
                             0);
}


/**
 * Implements iret for real mode and V8086 mode.
 *
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_1(iemCImpl_iret_real_v8086, IEMMODE, enmEffOpSize)
{
    X86EFLAGS Efl;
    Efl.u = IEMMISC_GET_EFL(pVCpu);
    NOREF(cbInstr);

    /*
     * iret throws an exception if VME isn't enabled.
     */
    if (   Efl.Bits.u1VM
        && Efl.Bits.u2IOPL != 3
        && !(pVCpu->cpum.GstCtx.cr4 & X86_CR4_VME))
        return iemRaiseGeneralProtectionFault0(pVCpu);

    /*
     * Do the stack bits, but don't commit RSP before everything checks
     * out right.
     */
    Assert(enmEffOpSize == IEMMODE_32BIT || enmEffOpSize == IEMMODE_16BIT);
    VBOXSTRICTRC    rcStrict;
    RTCPTRUNION     uFrame;
    uint16_t        uNewCs;
    uint32_t        uNewEip;
    uint32_t        uNewFlags;
    uint64_t        uNewRsp;
    if (enmEffOpSize == IEMMODE_32BIT)
    {
        rcStrict = iemMemStackPopBeginSpecial(pVCpu, 12, 1, &uFrame.pv, &uNewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        uNewEip    = uFrame.pu32[0];
        if (uNewEip > UINT16_MAX)
            return iemRaiseGeneralProtectionFault0(pVCpu);

        uNewCs     = (uint16_t)uFrame.pu32[1];
        uNewFlags  = uFrame.pu32[2];
        uNewFlags &= X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF
                   | X86_EFL_TF | X86_EFL_IF | X86_EFL_DF | X86_EFL_OF | X86_EFL_IOPL | X86_EFL_NT
                   | X86_EFL_RF /*| X86_EFL_VM*/ | X86_EFL_AC /*|X86_EFL_VIF*/ /*|X86_EFL_VIP*/
                   | X86_EFL_ID;
        if (IEM_GET_TARGET_CPU(pVCpu) <= IEMTARGETCPU_386)
            uNewFlags &= ~(X86_EFL_AC | X86_EFL_ID | X86_EFL_VIF | X86_EFL_VIP);
        uNewFlags |= Efl.u & (X86_EFL_VM | X86_EFL_VIF | X86_EFL_VIP | X86_EFL_1);
    }
    else
    {
        rcStrict = iemMemStackPopBeginSpecial(pVCpu, 6, 1, &uFrame.pv, &uNewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        uNewEip    = uFrame.pu16[0];
        uNewCs     = uFrame.pu16[1];
        uNewFlags  = uFrame.pu16[2];
        uNewFlags &= X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF
                   | X86_EFL_TF | X86_EFL_IF | X86_EFL_DF | X86_EFL_OF | X86_EFL_IOPL | X86_EFL_NT;
        uNewFlags |= Efl.u & ((UINT32_C(0xffff0000) | X86_EFL_1) & ~X86_EFL_RF);
        /** @todo The intel pseudo code does not indicate what happens to
         *        reserved flags. We just ignore them. */
        /* Ancient CPU adjustments: See iemCImpl_popf. */
        if (IEM_GET_TARGET_CPU(pVCpu) == IEMTARGETCPU_286)
            uNewFlags &= ~(X86_EFL_NT | X86_EFL_IOPL);
    }
    rcStrict = iemMemStackPopDoneSpecial(pVCpu, uFrame.pv);
    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
    { /* extremely likely */ }
    else
        return rcStrict;

    /** @todo Check how this is supposed to work if sp=0xfffe. */
    Log7(("iemCImpl_iret_real_v8086: uNewCs=%#06x uNewRip=%#010x uNewFlags=%#x uNewRsp=%#18llx\n",
          uNewCs, uNewEip, uNewFlags, uNewRsp));

    /*
     * Check the limit of the new EIP.
     */
    /** @todo Only the AMD pseudo code check the limit here, what's
     *        right? */
    if (uNewEip > pVCpu->cpum.GstCtx.cs.u32Limit)
        return iemRaiseSelectorBounds(pVCpu, X86_SREG_CS, IEM_ACCESS_INSTRUCTION);

    /*
     * V8086 checks and flag adjustments
     */
    if (Efl.Bits.u1VM)
    {
        if (Efl.Bits.u2IOPL == 3)
        {
            /* Preserve IOPL and clear RF. */
            uNewFlags &=        ~(X86_EFL_IOPL | X86_EFL_RF);
            uNewFlags |= Efl.u & (X86_EFL_IOPL);
        }
        else if (   enmEffOpSize == IEMMODE_16BIT
                 && (   !(uNewFlags & X86_EFL_IF)
                     || !Efl.Bits.u1VIP )
                 && !(uNewFlags & X86_EFL_TF)   )
        {
            /* Move IF to VIF, clear RF and preserve IF and IOPL.*/
            uNewFlags &= ~X86_EFL_VIF;
            uNewFlags |= (uNewFlags & X86_EFL_IF) << (19 - 9);
            uNewFlags &=        ~(X86_EFL_IF | X86_EFL_IOPL | X86_EFL_RF);
            uNewFlags |= Efl.u & (X86_EFL_IF | X86_EFL_IOPL);
        }
        else
            return iemRaiseGeneralProtectionFault0(pVCpu);
        Log7(("iemCImpl_iret_real_v8086: u1VM=1: adjusted uNewFlags=%#x\n", uNewFlags));
    }

    /*
     * Commit the operation.
     */
#ifdef DBGFTRACE_ENABLED
    RTTraceBufAddMsgF(pVCpu->CTX_SUFF(pVM)->CTX_SUFF(hTraceBuf), "iret/rm %04x:%04x -> %04x:%04x %x %04llx",
                      pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.eip, uNewCs, uNewEip, uNewFlags, uNewRsp);
#endif
    pVCpu->cpum.GstCtx.rsp           = uNewRsp;
    pVCpu->cpum.GstCtx.rip           = uNewEip;
    pVCpu->cpum.GstCtx.cs.Sel        = uNewCs;
    pVCpu->cpum.GstCtx.cs.ValidSel   = uNewCs;
    pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;
    pVCpu->cpum.GstCtx.cs.u64Base    = (uint32_t)uNewCs << 4;
    /** @todo do we load attribs and limit as well? */
    Assert(uNewFlags & X86_EFL_1);
    IEMMISC_SET_EFL(pVCpu, uNewFlags);

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr); /** @todo can do light flush in real mode at least */

/** @todo single stepping   */
    return VINF_SUCCESS;
}


/**
 * Loads a segment register when entering V8086 mode.
 *
 * @param   pSReg           The segment register.
 * @param   uSeg            The segment to load.
 */
static void iemCImplCommonV8086LoadSeg(PCPUMSELREG pSReg, uint16_t uSeg)
{
    pSReg->Sel        = uSeg;
    pSReg->ValidSel   = uSeg;
    pSReg->fFlags     = CPUMSELREG_FLAGS_VALID;
    pSReg->u64Base    = (uint32_t)uSeg << 4;
    pSReg->u32Limit   = 0xffff;
    pSReg->Attr.u     = X86_SEL_TYPE_RW_ACC | RT_BIT(4) /*!sys*/ | RT_BIT(7) /*P*/ | (3 /*DPL*/ << 5); /* VT-x wants 0xf3 */
    /** @todo Testcase: Check if VT-x really needs this and what it does itself when
     *        IRET'ing to V8086. */
}


/**
 * Implements iret for protected mode returning to V8086 mode.
 *
 * @param   uNewEip         The new EIP.
 * @param   uNewCs          The new CS.
 * @param   uNewFlags       The new EFLAGS.
 * @param   uNewRsp         The RSP after the initial IRET frame.
 *
 * @note    This can only be a 32-bit iret du to the X86_EFL_VM position.
 */
IEM_CIMPL_DEF_4(iemCImpl_iret_prot_v8086, uint32_t, uNewEip, uint16_t, uNewCs, uint32_t, uNewFlags, uint64_t, uNewRsp)
{
    RT_NOREF_PV(cbInstr);
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_SREG_MASK);

    /*
     * Pop the V8086 specific frame bits off the stack.
     */
    VBOXSTRICTRC    rcStrict;
    RTCPTRUNION     uFrame;
    rcStrict = iemMemStackPopContinueSpecial(pVCpu, 0 /*off*/, 24 /*cbMem*/, &uFrame.pv, uNewRsp);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;
    uint32_t uNewEsp = uFrame.pu32[0];
    uint16_t uNewSs  = uFrame.pu32[1];
    uint16_t uNewEs  = uFrame.pu32[2];
    uint16_t uNewDs  = uFrame.pu32[3];
    uint16_t uNewFs  = uFrame.pu32[4];
    uint16_t uNewGs  = uFrame.pu32[5];
    rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)uFrame.pv, IEM_ACCESS_STACK_R); /* don't use iemMemStackPopCommitSpecial here. */
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /*
     * Commit the operation.
     */
    uNewFlags &= X86_EFL_LIVE_MASK;
    uNewFlags |= X86_EFL_RA1_MASK;
#ifdef DBGFTRACE_ENABLED
    RTTraceBufAddMsgF(pVCpu->CTX_SUFF(pVM)->CTX_SUFF(hTraceBuf), "iret/p/v %04x:%08x -> %04x:%04x %x %04x:%04x",
                      pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.eip, uNewCs, uNewEip, uNewFlags, uNewSs, uNewEsp);
#endif
    Log7(("iemCImpl_iret_prot_v8086: %04x:%08x -> %04x:%04x %x %04x:%04x\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.eip, uNewCs, uNewEip, uNewFlags, uNewSs, uNewEsp));

    IEMMISC_SET_EFL(pVCpu, uNewFlags);
    iemCImplCommonV8086LoadSeg(&pVCpu->cpum.GstCtx.cs, uNewCs);
    iemCImplCommonV8086LoadSeg(&pVCpu->cpum.GstCtx.ss, uNewSs);
    iemCImplCommonV8086LoadSeg(&pVCpu->cpum.GstCtx.es, uNewEs);
    iemCImplCommonV8086LoadSeg(&pVCpu->cpum.GstCtx.ds, uNewDs);
    iemCImplCommonV8086LoadSeg(&pVCpu->cpum.GstCtx.fs, uNewFs);
    iemCImplCommonV8086LoadSeg(&pVCpu->cpum.GstCtx.gs, uNewGs);
    pVCpu->cpum.GstCtx.rip      = (uint16_t)uNewEip;
    pVCpu->cpum.GstCtx.rsp      = uNewEsp; /** @todo check this out! */
    pVCpu->iem.s.uCpl  = 3;

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr);

/** @todo single stepping   */
    return VINF_SUCCESS;
}


/**
 * Implements iret for protected mode returning via a nested task.
 *
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_1(iemCImpl_iret_prot_NestedTask, IEMMODE, enmEffOpSize)
{
    Log7(("iemCImpl_iret_prot_NestedTask:\n"));
#ifndef IEM_IMPLEMENTS_TASKSWITCH
    IEM_RETURN_ASPECT_NOT_IMPLEMENTED();
#else
    RT_NOREF_PV(enmEffOpSize);

    /*
     * Read the segment selector in the link-field of the current TSS.
     */
    RTSEL        uSelRet;
    VBOXSTRICTRC rcStrict = iemMemFetchSysU16(pVCpu, &uSelRet, UINT8_MAX, pVCpu->cpum.GstCtx.tr.u64Base);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /*
     * Fetch the returning task's TSS descriptor from the GDT.
     */
    if (uSelRet & X86_SEL_LDT)
    {
        Log(("iret_prot_NestedTask TSS not in LDT. uSelRet=%04x -> #TS\n", uSelRet));
        return iemRaiseTaskSwitchFaultBySelector(pVCpu, uSelRet);
    }

    IEMSELDESC TssDesc;
    rcStrict = iemMemFetchSelDesc(pVCpu, &TssDesc, uSelRet, X86_XCPT_GP);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    if (TssDesc.Legacy.Gate.u1DescType)
    {
        Log(("iret_prot_NestedTask Invalid TSS type. uSelRet=%04x -> #TS\n", uSelRet));
        return iemRaiseTaskSwitchFaultBySelector(pVCpu, uSelRet & X86_SEL_MASK_OFF_RPL);
    }

    if (   TssDesc.Legacy.Gate.u4Type != X86_SEL_TYPE_SYS_286_TSS_BUSY
        && TssDesc.Legacy.Gate.u4Type != X86_SEL_TYPE_SYS_386_TSS_BUSY)
    {
        Log(("iret_prot_NestedTask TSS is not busy. uSelRet=%04x DescType=%#x -> #TS\n", uSelRet, TssDesc.Legacy.Gate.u4Type));
        return iemRaiseTaskSwitchFaultBySelector(pVCpu, uSelRet & X86_SEL_MASK_OFF_RPL);
    }

    if (!TssDesc.Legacy.Gate.u1Present)
    {
        Log(("iret_prot_NestedTask TSS is not present. uSelRet=%04x -> #NP\n", uSelRet));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uSelRet & X86_SEL_MASK_OFF_RPL);
    }

    uint32_t uNextEip = pVCpu->cpum.GstCtx.eip + cbInstr;
    return iemTaskSwitch(pVCpu, IEMTASKSWITCH_IRET, uNextEip, 0 /* fFlags */, 0 /* uErr */,
                         0 /* uCr2 */, uSelRet, &TssDesc);
#endif
}


/**
 * Implements iret for protected mode
 *
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_1(iemCImpl_iret_prot, IEMMODE, enmEffOpSize)
{
    NOREF(cbInstr);
    Assert(enmEffOpSize == IEMMODE_32BIT || enmEffOpSize == IEMMODE_16BIT);

    /*
     * Nested task return.
     */
    if (pVCpu->cpum.GstCtx.eflags.Bits.u1NT)
        return IEM_CIMPL_CALL_1(iemCImpl_iret_prot_NestedTask, enmEffOpSize);

    /*
     * Normal return.
     *
     * Do the stack bits, but don't commit RSP before everything checks
     * out right.
     */
    Assert(enmEffOpSize == IEMMODE_32BIT || enmEffOpSize == IEMMODE_16BIT);
    VBOXSTRICTRC    rcStrict;
    RTCPTRUNION     uFrame;
    uint16_t        uNewCs;
    uint32_t        uNewEip;
    uint32_t        uNewFlags;
    uint64_t        uNewRsp;
    if (enmEffOpSize == IEMMODE_32BIT)
    {
        rcStrict = iemMemStackPopBeginSpecial(pVCpu, 12, 3, &uFrame.pv, &uNewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        uNewEip    = uFrame.pu32[0];
        uNewCs     = (uint16_t)uFrame.pu32[1];
        uNewFlags  = uFrame.pu32[2];
    }
    else
    {
        rcStrict = iemMemStackPopBeginSpecial(pVCpu, 6, 1, &uFrame.pv, &uNewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        uNewEip    = uFrame.pu16[0];
        uNewCs     = uFrame.pu16[1];
        uNewFlags  = uFrame.pu16[2];
    }
    rcStrict = iemMemStackPopDoneSpecial(pVCpu, (void *)uFrame.pv); /* don't use iemMemStackPopCommitSpecial here. */
    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
    { /* extremely likely */ }
    else
        return rcStrict;
    Log7(("iemCImpl_iret_prot: uNewCs=%#06x uNewEip=%#010x uNewFlags=%#x uNewRsp=%#18llx uCpl=%u\n", uNewCs, uNewEip, uNewFlags, uNewRsp, pVCpu->iem.s.uCpl));

    /*
     * We're hopefully not returning to V8086 mode...
     */
    if (   (uNewFlags & X86_EFL_VM)
        && pVCpu->iem.s.uCpl == 0)
    {
        Assert(enmEffOpSize == IEMMODE_32BIT);
        return IEM_CIMPL_CALL_4(iemCImpl_iret_prot_v8086, uNewEip, uNewCs, uNewFlags, uNewRsp);
    }

    /*
     * Protected mode.
     */
    /* Read the CS descriptor. */
    if (!(uNewCs & X86_SEL_MASK_OFF_RPL))
    {
        Log(("iret %04x:%08x -> invalid CS selector, #GP(0)\n", uNewCs, uNewEip));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    IEMSELDESC DescCS;
    rcStrict = iemMemFetchSelDesc(pVCpu, &DescCS, uNewCs, X86_XCPT_GP);
    if (rcStrict != VINF_SUCCESS)
    {
        Log(("iret %04x:%08x - rcStrict=%Rrc when fetching CS\n", uNewCs, uNewEip, VBOXSTRICTRC_VAL(rcStrict)));
        return rcStrict;
    }

    /* Must be a code descriptor. */
    if (!DescCS.Legacy.Gen.u1DescType)
    {
        Log(("iret %04x:%08x - CS is system segment (%#x) -> #GP\n", uNewCs, uNewEip, DescCS.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }
    if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_CODE))
    {
        Log(("iret %04x:%08x - not code segment (%#x) -> #GP\n", uNewCs, uNewEip, DescCS.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }

    /* Privilege checks. */
    if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_CONF))
    {
        if ((uNewCs & X86_SEL_RPL) != DescCS.Legacy.Gen.u2Dpl)
        {
            Log(("iret %04x:%08x - RPL != DPL (%d) -> #GP\n", uNewCs, uNewEip, DescCS.Legacy.Gen.u2Dpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
        }
    }
    else if ((uNewCs & X86_SEL_RPL) < DescCS.Legacy.Gen.u2Dpl)
    {
        Log(("iret %04x:%08x - RPL < DPL (%d) -> #GP\n", uNewCs, uNewEip, DescCS.Legacy.Gen.u2Dpl));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }
    if ((uNewCs & X86_SEL_RPL) < pVCpu->iem.s.uCpl)
    {
        Log(("iret %04x:%08x - RPL < CPL (%d) -> #GP\n", uNewCs, uNewEip, pVCpu->iem.s.uCpl));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }

    /* Present? */
    if (!DescCS.Legacy.Gen.u1Present)
    {
        Log(("iret %04x:%08x - CS not present -> #NP\n", uNewCs, uNewEip));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uNewCs);
    }

    uint32_t cbLimitCS = X86DESC_LIMIT_G(&DescCS.Legacy);

    /*
     * Return to outer level?
     */
    if ((uNewCs & X86_SEL_RPL) != pVCpu->iem.s.uCpl)
    {
        uint16_t    uNewSS;
        uint32_t    uNewESP;
        if (enmEffOpSize == IEMMODE_32BIT)
        {
            rcStrict = iemMemStackPopContinueSpecial(pVCpu, 0/*off*/, 8 /*cbMem*/, &uFrame.pv, uNewRsp);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
/** @todo We might be popping a 32-bit ESP from the IRET frame, but whether
 *        16-bit or 32-bit are being loaded into SP depends on the D/B
 *        bit of the popped SS selector it turns out. */
            uNewESP = uFrame.pu32[0];
            uNewSS  = (uint16_t)uFrame.pu32[1];
        }
        else
        {
            rcStrict = iemMemStackPopContinueSpecial(pVCpu, 0 /*off*/, 4 /*cbMem*/, &uFrame.pv, uNewRsp);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
            uNewESP = uFrame.pu16[0];
            uNewSS  = uFrame.pu16[1];
        }
        rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)uFrame.pv, IEM_ACCESS_STACK_R);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        Log7(("iemCImpl_iret_prot: uNewSS=%#06x uNewESP=%#010x\n", uNewSS, uNewESP));

        /* Read the SS descriptor. */
        if (!(uNewSS & X86_SEL_MASK_OFF_RPL))
        {
            Log(("iret %04x:%08x/%04x:%08x -> invalid SS selector, #GP(0)\n", uNewCs, uNewEip, uNewSS, uNewESP));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }

        IEMSELDESC DescSS;
        rcStrict = iemMemFetchSelDesc(pVCpu, &DescSS, uNewSS, X86_XCPT_GP); /** @todo Correct exception? */
        if (rcStrict != VINF_SUCCESS)
        {
            Log(("iret %04x:%08x/%04x:%08x - %Rrc when fetching SS\n",
                 uNewCs, uNewEip, uNewSS, uNewESP, VBOXSTRICTRC_VAL(rcStrict)));
            return rcStrict;
        }

        /* Privilege checks. */
        if ((uNewSS & X86_SEL_RPL) != (uNewCs & X86_SEL_RPL))
        {
            Log(("iret %04x:%08x/%04x:%08x -> SS.RPL != CS.RPL -> #GP\n", uNewCs, uNewEip, uNewSS, uNewESP));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewSS);
        }
        if (DescSS.Legacy.Gen.u2Dpl != (uNewCs & X86_SEL_RPL))
        {
            Log(("iret %04x:%08x/%04x:%08x -> SS.DPL (%d) != CS.RPL -> #GP\n",
                 uNewCs, uNewEip, uNewSS, uNewESP, DescSS.Legacy.Gen.u2Dpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewSS);
        }

        /* Must be a writeable data segment descriptor. */
        if (!DescSS.Legacy.Gen.u1DescType)
        {
            Log(("iret %04x:%08x/%04x:%08x -> SS is system segment (%#x) -> #GP\n",
                 uNewCs, uNewEip, uNewSS, uNewESP, DescSS.Legacy.Gen.u4Type));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewSS);
        }
        if ((DescSS.Legacy.Gen.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_WRITE)) != X86_SEL_TYPE_WRITE)
        {
            Log(("iret %04x:%08x/%04x:%08x - not writable data segment (%#x) -> #GP\n",
                 uNewCs, uNewEip, uNewSS, uNewESP, DescSS.Legacy.Gen.u4Type));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewSS);
        }

        /* Present? */
        if (!DescSS.Legacy.Gen.u1Present)
        {
            Log(("iret %04x:%08x/%04x:%08x -> SS not present -> #SS\n", uNewCs, uNewEip, uNewSS, uNewESP));
            return iemRaiseStackSelectorNotPresentBySelector(pVCpu, uNewSS);
        }

        uint32_t cbLimitSs = X86DESC_LIMIT_G(&DescSS.Legacy);

        /* Check EIP. */
        if (uNewEip > cbLimitCS)
        {
            Log(("iret %04x:%08x/%04x:%08x -> EIP is out of bounds (%#x) -> #GP(0)\n",
                 uNewCs, uNewEip, uNewSS, uNewESP, cbLimitCS));
            /** @todo Which is it, \#GP(0) or \#GP(sel)? */
            return iemRaiseSelectorBoundsBySelector(pVCpu, uNewCs);
        }

        /*
         * Commit the changes, marking CS and SS accessed first since
         * that may fail.
         */
        if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
        {
            rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewCs);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
            DescCS.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
        }
        if (!(DescSS.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
        {
            rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewSS);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
            DescSS.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
        }

        uint32_t fEFlagsMask = X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF
                             | X86_EFL_TF | X86_EFL_DF | X86_EFL_OF | X86_EFL_NT;
        if (enmEffOpSize != IEMMODE_16BIT)
            fEFlagsMask |= X86_EFL_RF | X86_EFL_AC | X86_EFL_ID;
        if (pVCpu->iem.s.uCpl == 0)
            fEFlagsMask |= X86_EFL_IF | X86_EFL_IOPL | X86_EFL_VIF | X86_EFL_VIP; /* VM is 0 */
        else if (pVCpu->iem.s.uCpl <= pVCpu->cpum.GstCtx.eflags.Bits.u2IOPL)
            fEFlagsMask |= X86_EFL_IF;
        if (IEM_GET_TARGET_CPU(pVCpu) <= IEMTARGETCPU_386)
            fEFlagsMask &= ~(X86_EFL_AC | X86_EFL_ID | X86_EFL_VIF | X86_EFL_VIP);
        uint32_t fEFlagsNew = IEMMISC_GET_EFL(pVCpu);
        fEFlagsNew         &= ~fEFlagsMask;
        fEFlagsNew         |= uNewFlags & fEFlagsMask;
#ifdef DBGFTRACE_ENABLED
        RTTraceBufAddMsgF(pVCpu->CTX_SUFF(pVM)->CTX_SUFF(hTraceBuf), "iret/%up%u %04x:%08x -> %04x:%04x %x %04x:%04x",
                          pVCpu->iem.s.uCpl, uNewCs & X86_SEL_RPL, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.eip,
                          uNewCs, uNewEip, uNewFlags,  uNewSS, uNewESP);
#endif

        IEMMISC_SET_EFL(pVCpu, fEFlagsNew);
        pVCpu->cpum.GstCtx.rip           = uNewEip;
        pVCpu->cpum.GstCtx.cs.Sel        = uNewCs;
        pVCpu->cpum.GstCtx.cs.ValidSel   = uNewCs;
        pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.cs.Attr.u     = X86DESC_GET_HID_ATTR(&DescCS.Legacy);
        pVCpu->cpum.GstCtx.cs.u32Limit   = cbLimitCS;
        pVCpu->cpum.GstCtx.cs.u64Base    = X86DESC_BASE(&DescCS.Legacy);
        pVCpu->iem.s.enmCpuMode = iemCalcCpuMode(pVCpu);

        pVCpu->cpum.GstCtx.ss.Sel        = uNewSS;
        pVCpu->cpum.GstCtx.ss.ValidSel   = uNewSS;
        pVCpu->cpum.GstCtx.ss.fFlags     = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.ss.Attr.u     = X86DESC_GET_HID_ATTR(&DescSS.Legacy);
        pVCpu->cpum.GstCtx.ss.u32Limit   = cbLimitSs;
        pVCpu->cpum.GstCtx.ss.u64Base    = X86DESC_BASE(&DescSS.Legacy);
        if (!pVCpu->cpum.GstCtx.ss.Attr.n.u1DefBig)
            pVCpu->cpum.GstCtx.sp        = (uint16_t)uNewESP;
        else
            pVCpu->cpum.GstCtx.rsp       = uNewESP;

        pVCpu->iem.s.uCpl       = uNewCs & X86_SEL_RPL;
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCs & X86_SEL_RPL, &pVCpu->cpum.GstCtx.ds);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCs & X86_SEL_RPL, &pVCpu->cpum.GstCtx.es);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCs & X86_SEL_RPL, &pVCpu->cpum.GstCtx.fs);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCs & X86_SEL_RPL, &pVCpu->cpum.GstCtx.gs);

        /* Done! */

    }
    /*
     * Return to the same level.
     */
    else
    {
        /* Check EIP. */
        if (uNewEip > cbLimitCS)
        {
            Log(("iret %04x:%08x - EIP is out of bounds (%#x) -> #GP(0)\n", uNewCs, uNewEip, cbLimitCS));
            /** @todo Which is it, \#GP(0) or \#GP(sel)? */
            return iemRaiseSelectorBoundsBySelector(pVCpu, uNewCs);
        }

        /*
         * Commit the changes, marking CS first since it may fail.
         */
        if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
        {
            rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewCs);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
            DescCS.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
        }

        X86EFLAGS NewEfl;
        NewEfl.u = IEMMISC_GET_EFL(pVCpu);
        uint32_t fEFlagsMask = X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF
                             | X86_EFL_TF | X86_EFL_DF | X86_EFL_OF | X86_EFL_NT;
        if (enmEffOpSize != IEMMODE_16BIT)
            fEFlagsMask |= X86_EFL_RF | X86_EFL_AC | X86_EFL_ID;
        if (pVCpu->iem.s.uCpl == 0)
            fEFlagsMask |= X86_EFL_IF | X86_EFL_IOPL | X86_EFL_VIF | X86_EFL_VIP; /* VM is 0 */
        else if (pVCpu->iem.s.uCpl <= NewEfl.Bits.u2IOPL)
            fEFlagsMask |= X86_EFL_IF;
        if (IEM_GET_TARGET_CPU(pVCpu) <= IEMTARGETCPU_386)
            fEFlagsMask &= ~(X86_EFL_AC | X86_EFL_ID | X86_EFL_VIF | X86_EFL_VIP);
        NewEfl.u           &= ~fEFlagsMask;
        NewEfl.u           |= fEFlagsMask & uNewFlags;
#ifdef DBGFTRACE_ENABLED
        RTTraceBufAddMsgF(pVCpu->CTX_SUFF(pVM)->CTX_SUFF(hTraceBuf), "iret/%up %04x:%08x -> %04x:%04x %x %04x:%04llx",
                          pVCpu->iem.s.uCpl, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.eip,
                          uNewCs, uNewEip, uNewFlags, pVCpu->cpum.GstCtx.ss.Sel, uNewRsp);
#endif

        IEMMISC_SET_EFL(pVCpu, NewEfl.u);
        pVCpu->cpum.GstCtx.rip           = uNewEip;
        pVCpu->cpum.GstCtx.cs.Sel        = uNewCs;
        pVCpu->cpum.GstCtx.cs.ValidSel   = uNewCs;
        pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.cs.Attr.u     = X86DESC_GET_HID_ATTR(&DescCS.Legacy);
        pVCpu->cpum.GstCtx.cs.u32Limit   = cbLimitCS;
        pVCpu->cpum.GstCtx.cs.u64Base    = X86DESC_BASE(&DescCS.Legacy);
        pVCpu->iem.s.enmCpuMode = iemCalcCpuMode(pVCpu);
        if (!pVCpu->cpum.GstCtx.ss.Attr.n.u1DefBig)
            pVCpu->cpum.GstCtx.sp        = (uint16_t)uNewRsp;
        else
            pVCpu->cpum.GstCtx.rsp       = uNewRsp;
        /* Done! */
    }

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr); /** @todo may light flush if same ring? */

/** @todo single stepping   */
    return VINF_SUCCESS;
}


/**
 * Implements iret for long mode
 *
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_1(iemCImpl_iret_64bit, IEMMODE, enmEffOpSize)
{
    NOREF(cbInstr);

    /*
     * Nested task return is not supported in long mode.
     */
    if (pVCpu->cpum.GstCtx.eflags.Bits.u1NT)
    {
        Log(("iretq with NT=1 (eflags=%#x) -> #GP(0)\n", pVCpu->cpum.GstCtx.eflags.u));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /*
     * Normal return.
     *
     * Do the stack bits, but don't commit RSP before everything checks
     * out right.
     */
    VBOXSTRICTRC    rcStrict;
    RTCPTRUNION     uFrame;
    uint64_t        uNewRip;
    uint16_t        uNewCs;
    uint16_t        uNewSs;
    uint32_t        uNewFlags;
    uint64_t        uNewRsp;
    if (enmEffOpSize == IEMMODE_64BIT)
    {
        rcStrict = iemMemStackPopBeginSpecial(pVCpu, 5*8, 7, &uFrame.pv, &uNewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        uNewRip    = uFrame.pu64[0];
        uNewCs     = (uint16_t)uFrame.pu64[1];
        uNewFlags  = (uint32_t)uFrame.pu64[2];
        uNewRsp    = uFrame.pu64[3];
        uNewSs     = (uint16_t)uFrame.pu64[4];
    }
    else if (enmEffOpSize == IEMMODE_32BIT)
    {
        rcStrict = iemMemStackPopBeginSpecial(pVCpu, 5*4, 3, &uFrame.pv, &uNewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        uNewRip    = uFrame.pu32[0];
        uNewCs     = (uint16_t)uFrame.pu32[1];
        uNewFlags  = uFrame.pu32[2];
        uNewRsp    = uFrame.pu32[3];
        uNewSs     = (uint16_t)uFrame.pu32[4];
    }
    else
    {
        Assert(enmEffOpSize == IEMMODE_16BIT);
        rcStrict = iemMemStackPopBeginSpecial(pVCpu, 5*2, 1, &uFrame.pv, &uNewRsp);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        uNewRip    = uFrame.pu16[0];
        uNewCs     = uFrame.pu16[1];
        uNewFlags  = uFrame.pu16[2];
        uNewRsp    = uFrame.pu16[3];
        uNewSs     = uFrame.pu16[4];
    }
    rcStrict = iemMemStackPopDoneSpecial(pVCpu, (void *)uFrame.pv); /* don't use iemMemStackPopCommitSpecial here. */
    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
    { /* extremely like */ }
    else
        return rcStrict;
    Log7(("iretq stack: cs:rip=%04x:%016RX64 rflags=%016RX64 ss:rsp=%04x:%016RX64\n", uNewCs, uNewRip, uNewFlags, uNewSs, uNewRsp));

    /*
     * Check stuff.
     */
    /* Read the CS descriptor. */
    if (!(uNewCs & X86_SEL_MASK_OFF_RPL))
    {
        Log(("iret %04x:%016RX64/%04x:%016RX64 -> invalid CS selector, #GP(0)\n", uNewCs, uNewRip, uNewSs, uNewRsp));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    IEMSELDESC DescCS;
    rcStrict = iemMemFetchSelDesc(pVCpu, &DescCS, uNewCs, X86_XCPT_GP);
    if (rcStrict != VINF_SUCCESS)
    {
        Log(("iret %04x:%016RX64/%04x:%016RX64 - rcStrict=%Rrc when fetching CS\n",
             uNewCs, uNewRip, uNewSs, uNewRsp, VBOXSTRICTRC_VAL(rcStrict)));
        return rcStrict;
    }

    /* Must be a code descriptor. */
    if (   !DescCS.Legacy.Gen.u1DescType
        || !(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_CODE))
    {
        Log(("iret %04x:%016RX64/%04x:%016RX64 - CS is not a code segment T=%u T=%#xu -> #GP\n",
             uNewCs, uNewRip, uNewSs, uNewRsp, DescCS.Legacy.Gen.u1DescType, DescCS.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }

    /* Privilege checks. */
    uint8_t const uNewCpl = uNewCs & X86_SEL_RPL;
    if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_CONF))
    {
        if ((uNewCs & X86_SEL_RPL) != DescCS.Legacy.Gen.u2Dpl)
        {
            Log(("iret %04x:%016RX64 - RPL != DPL (%d) -> #GP\n", uNewCs, uNewRip, DescCS.Legacy.Gen.u2Dpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
        }
    }
    else if ((uNewCs & X86_SEL_RPL) < DescCS.Legacy.Gen.u2Dpl)
    {
        Log(("iret %04x:%016RX64 - RPL < DPL (%d) -> #GP\n", uNewCs, uNewRip, DescCS.Legacy.Gen.u2Dpl));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }
    if ((uNewCs & X86_SEL_RPL) < pVCpu->iem.s.uCpl)
    {
        Log(("iret %04x:%016RX64 - RPL < CPL (%d) -> #GP\n", uNewCs, uNewRip, pVCpu->iem.s.uCpl));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewCs);
    }

    /* Present? */
    if (!DescCS.Legacy.Gen.u1Present)
    {
        Log(("iret %04x:%016RX64/%04x:%016RX64 - CS not present -> #NP\n", uNewCs, uNewRip, uNewSs, uNewRsp));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uNewCs);
    }

    uint32_t cbLimitCS = X86DESC_LIMIT_G(&DescCS.Legacy);

    /* Read the SS descriptor. */
    IEMSELDESC DescSS;
    if (!(uNewSs & X86_SEL_MASK_OFF_RPL))
    {
        if (   !DescCS.Legacy.Gen.u1Long
            || DescCS.Legacy.Gen.u1DefBig /** @todo exactly how does iret (and others) behave with u1Long=1 and u1DefBig=1? \#GP(sel)? */
            || uNewCpl > 2) /** @todo verify SS=0 impossible for ring-3. */
        {
            Log(("iret %04x:%016RX64/%04x:%016RX64 -> invalid SS selector, #GP(0)\n", uNewCs, uNewRip, uNewSs, uNewRsp));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }
        /* Make sure SS is sensible, marked as accessed etc. */
        iemMemFakeStackSelDesc(&DescSS, (uNewSs & X86_SEL_RPL));
    }
    else
    {
        rcStrict = iemMemFetchSelDesc(pVCpu, &DescSS, uNewSs, X86_XCPT_GP); /** @todo Correct exception? */
        if (rcStrict != VINF_SUCCESS)
        {
            Log(("iret %04x:%016RX64/%04x:%016RX64 - %Rrc when fetching SS\n",
                 uNewCs, uNewRip, uNewSs, uNewRsp, VBOXSTRICTRC_VAL(rcStrict)));
            return rcStrict;
        }
    }

    /* Privilege checks. */
    if ((uNewSs & X86_SEL_RPL) != (uNewCs & X86_SEL_RPL))
    {
        Log(("iret %04x:%016RX64/%04x:%016RX64 -> SS.RPL != CS.RPL -> #GP\n", uNewCs, uNewRip, uNewSs, uNewRsp));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewSs);
    }

    uint32_t cbLimitSs;
    if (!(uNewSs & X86_SEL_MASK_OFF_RPL))
        cbLimitSs = UINT32_MAX;
    else
    {
        if (DescSS.Legacy.Gen.u2Dpl != (uNewCs & X86_SEL_RPL))
        {
            Log(("iret %04x:%016RX64/%04x:%016RX64 -> SS.DPL (%d) != CS.RPL -> #GP\n",
                 uNewCs, uNewRip, uNewSs, uNewRsp, DescSS.Legacy.Gen.u2Dpl));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewSs);
        }

        /* Must be a writeable data segment descriptor. */
        if (!DescSS.Legacy.Gen.u1DescType)
        {
            Log(("iret %04x:%016RX64/%04x:%016RX64 -> SS is system segment (%#x) -> #GP\n",
                 uNewCs, uNewRip, uNewSs, uNewRsp, DescSS.Legacy.Gen.u4Type));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewSs);
        }
        if ((DescSS.Legacy.Gen.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_WRITE)) != X86_SEL_TYPE_WRITE)
        {
            Log(("iret %04x:%016RX64/%04x:%016RX64 - not writable data segment (%#x) -> #GP\n",
                 uNewCs, uNewRip, uNewSs, uNewRsp, DescSS.Legacy.Gen.u4Type));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewSs);
        }

        /* Present? */
        if (!DescSS.Legacy.Gen.u1Present)
        {
            Log(("iret %04x:%016RX64/%04x:%016RX64 -> SS not present -> #SS\n", uNewCs, uNewRip, uNewSs, uNewRsp));
            return iemRaiseStackSelectorNotPresentBySelector(pVCpu, uNewSs);
        }
        cbLimitSs = X86DESC_LIMIT_G(&DescSS.Legacy);
    }

    /* Check EIP. */
    if (DescCS.Legacy.Gen.u1Long)
    {
        if (!IEM_IS_CANONICAL(uNewRip))
        {
            Log(("iret %04x:%016RX64/%04x:%016RX64 -> RIP is not canonical -> #GP(0)\n",
                 uNewCs, uNewRip, uNewSs, uNewRsp));
            return iemRaiseSelectorBoundsBySelector(pVCpu, uNewCs);
        }
    }
    else
    {
        if (uNewRip > cbLimitCS)
        {
            Log(("iret %04x:%016RX64/%04x:%016RX64 -> EIP is out of bounds (%#x) -> #GP(0)\n",
                 uNewCs, uNewRip, uNewSs, uNewRsp, cbLimitCS));
            /** @todo Which is it, \#GP(0) or \#GP(sel)? */
            return iemRaiseSelectorBoundsBySelector(pVCpu, uNewCs);
        }
    }

    /*
     * Commit the changes, marking CS and SS accessed first since
     * that may fail.
     */
    /** @todo where exactly are these actually marked accessed by a real CPU? */
    if (!(DescCS.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
    {
        rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewCs);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        DescCS.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
    }
    if (!(DescSS.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
    {
        rcStrict = iemMemMarkSelDescAccessed(pVCpu, uNewSs);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
        DescSS.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
    }

    uint32_t fEFlagsMask = X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF
                         | X86_EFL_TF | X86_EFL_DF | X86_EFL_OF | X86_EFL_NT;
    if (enmEffOpSize != IEMMODE_16BIT)
        fEFlagsMask |= X86_EFL_RF | X86_EFL_AC | X86_EFL_ID;
    if (pVCpu->iem.s.uCpl == 0)
        fEFlagsMask |= X86_EFL_IF | X86_EFL_IOPL | X86_EFL_VIF | X86_EFL_VIP; /* VM is ignored */
    else if (pVCpu->iem.s.uCpl <= pVCpu->cpum.GstCtx.eflags.Bits.u2IOPL)
        fEFlagsMask |= X86_EFL_IF;
    uint32_t fEFlagsNew = IEMMISC_GET_EFL(pVCpu);
    fEFlagsNew         &= ~fEFlagsMask;
    fEFlagsNew         |= uNewFlags & fEFlagsMask;
#ifdef DBGFTRACE_ENABLED
    RTTraceBufAddMsgF(pVCpu->CTX_SUFF(pVM)->CTX_SUFF(hTraceBuf), "iret/%ul%u %08llx -> %04x:%04llx %llx %04x:%04llx",
                      pVCpu->iem.s.uCpl, uNewCpl, pVCpu->cpum.GstCtx.rip, uNewCs, uNewRip, uNewFlags, uNewSs, uNewRsp);
#endif

    IEMMISC_SET_EFL(pVCpu, fEFlagsNew);
    pVCpu->cpum.GstCtx.rip           = uNewRip;
    pVCpu->cpum.GstCtx.cs.Sel        = uNewCs;
    pVCpu->cpum.GstCtx.cs.ValidSel   = uNewCs;
    pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;
    pVCpu->cpum.GstCtx.cs.Attr.u     = X86DESC_GET_HID_ATTR(&DescCS.Legacy);
    pVCpu->cpum.GstCtx.cs.u32Limit   = cbLimitCS;
    pVCpu->cpum.GstCtx.cs.u64Base    = X86DESC_BASE(&DescCS.Legacy);
    pVCpu->iem.s.enmCpuMode = iemCalcCpuMode(pVCpu);
    if (pVCpu->cpum.GstCtx.cs.Attr.n.u1Long || pVCpu->cpum.GstCtx.cs.Attr.n.u1DefBig)
        pVCpu->cpum.GstCtx.rsp       = uNewRsp;
    else
        pVCpu->cpum.GstCtx.sp        = (uint16_t)uNewRsp;
    pVCpu->cpum.GstCtx.ss.Sel        = uNewSs;
    pVCpu->cpum.GstCtx.ss.ValidSel   = uNewSs;
    if (!(uNewSs & X86_SEL_MASK_OFF_RPL))
    {
        pVCpu->cpum.GstCtx.ss.fFlags     = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.ss.Attr.u     = X86DESCATTR_UNUSABLE | (uNewCpl << X86DESCATTR_DPL_SHIFT);
        pVCpu->cpum.GstCtx.ss.u32Limit   = UINT32_MAX;
        pVCpu->cpum.GstCtx.ss.u64Base    = 0;
        Log2(("iretq new SS: NULL\n"));
    }
    else
    {
        pVCpu->cpum.GstCtx.ss.fFlags     = CPUMSELREG_FLAGS_VALID;
        pVCpu->cpum.GstCtx.ss.Attr.u     = X86DESC_GET_HID_ATTR(&DescSS.Legacy);
        pVCpu->cpum.GstCtx.ss.u32Limit   = cbLimitSs;
        pVCpu->cpum.GstCtx.ss.u64Base    = X86DESC_BASE(&DescSS.Legacy);
        Log2(("iretq new SS: base=%#RX64 lim=%#x attr=%#x\n", pVCpu->cpum.GstCtx.ss.u64Base, pVCpu->cpum.GstCtx.ss.u32Limit, pVCpu->cpum.GstCtx.ss.Attr.u));
    }

    if (pVCpu->iem.s.uCpl != uNewCpl)
    {
        pVCpu->iem.s.uCpl = uNewCpl;
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCpl, &pVCpu->cpum.GstCtx.ds);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCpl, &pVCpu->cpum.GstCtx.es);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCpl, &pVCpu->cpum.GstCtx.fs);
        iemHlpAdjustSelectorForNewCpl(pVCpu, uNewCpl, &pVCpu->cpum.GstCtx.gs);
    }

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr); /** @todo may light flush if the ring + mode doesn't change */

/** @todo single stepping   */
    return VINF_SUCCESS;
}


/**
 * Implements iret.
 *
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_1(iemCImpl_iret, IEMMODE, enmEffOpSize)
{
    bool fBlockingNmi = CPUMAreInterruptsInhibitedByNmi(&pVCpu->cpum.GstCtx);

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        /*
         * Record whether NMI (or virtual-NMI) blocking is in effect during the execution
         * of this IRET instruction. We need to provide this information as part of some
         * VM-exits.
         *
         * See Intel spec. 27.2.2 "Information for VM Exits Due to Vectored Events".
         */
        if (IEM_VMX_IS_PINCTLS_SET(pVCpu, VMX_PIN_CTLS_VIRT_NMI))
            pVCpu->cpum.GstCtx.hwvirt.vmx.fNmiUnblockingIret = pVCpu->cpum.GstCtx.hwvirt.vmx.fVirtNmiBlocking;
        else
            pVCpu->cpum.GstCtx.hwvirt.vmx.fNmiUnblockingIret = fBlockingNmi;

        /*
         * If "NMI exiting" is set, IRET does not affect blocking of NMIs.
         * See Intel Spec. 25.3 "Changes To Instruction Behavior In VMX Non-root Operation".
         */
        if (IEM_VMX_IS_PINCTLS_SET(pVCpu, VMX_PIN_CTLS_NMI_EXIT))
            fBlockingNmi = false;

        /* Clear virtual-NMI blocking, if any, before causing any further exceptions. */
        pVCpu->cpum.GstCtx.hwvirt.vmx.fVirtNmiBlocking = false;
    }
#endif

    /*
     * The SVM nested-guest intercept for IRET takes priority over all exceptions,
     * The NMI is still held pending (which I assume means blocking of further NMIs
     * is in effect).
     *
     * See AMD spec. 15.9 "Instruction Intercepts".
     * See AMD spec. 15.21.9 "NMI Support".
     */
    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_IRET))
    {
        Log(("iret: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_IRET, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * Clear NMI blocking, if any, before causing any further exceptions.
     * See Intel spec. 6.7.1 "Handling Multiple NMIs".
     */
    if (fBlockingNmi)
        CPUMClearInterruptInhibitingByNmi(&pVCpu->cpum.GstCtx);

    /*
     * Call a mode specific worker.
     */
    if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
        return IEM_CIMPL_CALL_1(iemCImpl_iret_real_v8086, enmEffOpSize);
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_SREG_MASK | CPUMCTX_EXTRN_GDTR | CPUMCTX_EXTRN_LDTR);
    if (pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT)
        return IEM_CIMPL_CALL_1(iemCImpl_iret_64bit, enmEffOpSize);
    return     IEM_CIMPL_CALL_1(iemCImpl_iret_prot, enmEffOpSize);
}


static void iemLoadallSetSelector(PVMCPUCC pVCpu, uint8_t iSegReg, uint16_t uSel)
{
    PCPUMSELREGHID  pHid = iemSRegGetHid(pVCpu, iSegReg);

    pHid->Sel      = uSel;
    pHid->ValidSel = uSel;
    pHid->fFlags   = CPUMSELREG_FLAGS_VALID;
}


static void iemLoadall286SetDescCache(PVMCPUCC pVCpu, uint8_t iSegReg, uint8_t const *pbMem)
{
    PCPUMSELREGHID  pHid = iemSRegGetHid(pVCpu, iSegReg);

    /* The base is in the first three bytes. */
    pHid->u64Base  = pbMem[0] + (pbMem[1] << 8) + (pbMem[2] << 16);
    /* The attributes are in the fourth byte. */
    pHid->Attr.u   = pbMem[3];
    /* The limit is in the last two bytes. */
    pHid->u32Limit = pbMem[4] + (pbMem[5] << 8);
}


/**
 * Implements 286 LOADALL (286 CPUs only).
 */
IEM_CIMPL_DEF_0(iemCImpl_loadall286)
{
    NOREF(cbInstr);

    /* Data is loaded from a buffer at 800h. No checks are done on the
     * validity of loaded state.
     *
     * LOADALL only loads the internal CPU state, it does not access any
     * GDT, LDT, or similar tables.
     */

    if (pVCpu->iem.s.uCpl != 0)
    {
        Log(("loadall286: CPL must be 0 not %u -> #GP(0)\n", pVCpu->iem.s.uCpl));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    uint8_t const *pbMem = NULL;
    uint16_t const *pa16Mem;
    uint8_t const *pa8Mem;
    RTGCPHYS GCPtrStart = 0x800;    /* Fixed table location. */
    VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, (void **)&pbMem, 0x66, UINT8_MAX, GCPtrStart, IEM_ACCESS_SYS_R, 0);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* The MSW is at offset 0x06. */
    pa16Mem = (uint16_t const *)(pbMem + 0x06);
    /* Even LOADALL can't clear the MSW.PE bit, though it can set it. */
    uint64_t uNewCr0 = pVCpu->cpum.GstCtx.cr0 & ~(X86_CR0_MP | X86_CR0_EM | X86_CR0_TS);
    uNewCr0 |= *pa16Mem & (X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS);
    uint64_t const uOldCr0 = pVCpu->cpum.GstCtx.cr0;

    CPUMSetGuestCR0(pVCpu, uNewCr0);
    Assert(pVCpu->cpum.GstCtx.cr0 == uNewCr0);

    /* Inform PGM if mode changed. */
    if ((uNewCr0 & X86_CR0_PE) != (uOldCr0 & X86_CR0_PE))
    {
        int rc = PGMFlushTLB(pVCpu, pVCpu->cpum.GstCtx.cr3, true /* global */);
        AssertRCReturn(rc, rc);
        /* ignore informational status codes */
    }
    rcStrict = PGMChangeMode(pVCpu, pVCpu->cpum.GstCtx.cr0, pVCpu->cpum.GstCtx.cr4, pVCpu->cpum.GstCtx.msrEFER,
                             false /* fForce */);

    /* TR selector is at offset 0x16. */
    pa16Mem = (uint16_t const *)(pbMem + 0x16);
    pVCpu->cpum.GstCtx.tr.Sel      = pa16Mem[0];
    pVCpu->cpum.GstCtx.tr.ValidSel = pa16Mem[0];
    pVCpu->cpum.GstCtx.tr.fFlags   = CPUMSELREG_FLAGS_VALID;

    /* Followed by FLAGS... */
    pVCpu->cpum.GstCtx.eflags.u = pa16Mem[1] | X86_EFL_1;
    pVCpu->cpum.GstCtx.ip       = pa16Mem[2];   /* ...and IP. */

    /* LDT is at offset 0x1C. */
    pa16Mem = (uint16_t const *)(pbMem + 0x1C);
    pVCpu->cpum.GstCtx.ldtr.Sel      = pa16Mem[0];
    pVCpu->cpum.GstCtx.ldtr.ValidSel = pa16Mem[0];
    pVCpu->cpum.GstCtx.ldtr.fFlags   = CPUMSELREG_FLAGS_VALID;

    /* Segment registers are at offset 0x1E. */
    pa16Mem = (uint16_t const *)(pbMem + 0x1E);
    iemLoadallSetSelector(pVCpu, X86_SREG_DS, pa16Mem[0]);
    iemLoadallSetSelector(pVCpu, X86_SREG_SS, pa16Mem[1]);
    iemLoadallSetSelector(pVCpu, X86_SREG_CS, pa16Mem[2]);
    iemLoadallSetSelector(pVCpu, X86_SREG_ES, pa16Mem[3]);

    /* GPRs are at offset 0x26. */
    pa16Mem = (uint16_t const *)(pbMem + 0x26);
    pVCpu->cpum.GstCtx.di = pa16Mem[0];
    pVCpu->cpum.GstCtx.si = pa16Mem[1];
    pVCpu->cpum.GstCtx.bp = pa16Mem[2];
    pVCpu->cpum.GstCtx.sp = pa16Mem[3];
    pVCpu->cpum.GstCtx.bx = pa16Mem[4];
    pVCpu->cpum.GstCtx.dx = pa16Mem[5];
    pVCpu->cpum.GstCtx.cx = pa16Mem[6];
    pVCpu->cpum.GstCtx.ax = pa16Mem[7];

    /* Descriptor caches are at offset 0x36, 6 bytes per entry. */
    iemLoadall286SetDescCache(pVCpu, X86_SREG_ES, pbMem + 0x36);
    iemLoadall286SetDescCache(pVCpu, X86_SREG_CS, pbMem + 0x3C);
    iemLoadall286SetDescCache(pVCpu, X86_SREG_SS, pbMem + 0x42);
    iemLoadall286SetDescCache(pVCpu, X86_SREG_DS, pbMem + 0x48);

    /* GDTR contents are at offset 0x4E, 6 bytes. */
    RTGCPHYS GCPtrBase;
    uint16_t cbLimit;
    pa8Mem = pbMem + 0x4E;
    /* NB: Fourth byte "should be zero"; we are ignoring it. */
    GCPtrBase = pa8Mem[0] + (pa8Mem[1] << 8) + (pa8Mem[2] << 16);
    cbLimit = pa8Mem[4] + (pa8Mem[5] << 8);
    CPUMSetGuestGDTR(pVCpu, GCPtrBase, cbLimit);

    /* IDTR contents are at offset 0x5A, 6 bytes. */
    pa8Mem = pbMem + 0x5A;
    GCPtrBase = pa8Mem[0] + (pa8Mem[1] << 8) + (pa8Mem[2] << 16);
    cbLimit = pa8Mem[4] + (pa8Mem[5] << 8);
    CPUMSetGuestIDTR(pVCpu, GCPtrBase, cbLimit);

    Log(("LOADALL: GDTR:%08RX64/%04X, IDTR:%08RX64/%04X\n", pVCpu->cpum.GstCtx.gdtr.pGdt, pVCpu->cpum.GstCtx.gdtr.cbGdt, pVCpu->cpum.GstCtx.idtr.pIdt, pVCpu->cpum.GstCtx.idtr.cbIdt));
    Log(("LOADALL: CS:%04X, CS base:%08X, limit:%04X, attrs:%02X\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.cs.u64Base, pVCpu->cpum.GstCtx.cs.u32Limit, pVCpu->cpum.GstCtx.cs.Attr.u));
    Log(("LOADALL: DS:%04X, DS base:%08X, limit:%04X, attrs:%02X\n", pVCpu->cpum.GstCtx.ds.Sel, pVCpu->cpum.GstCtx.ds.u64Base, pVCpu->cpum.GstCtx.ds.u32Limit, pVCpu->cpum.GstCtx.ds.Attr.u));
    Log(("LOADALL: ES:%04X, ES base:%08X, limit:%04X, attrs:%02X\n", pVCpu->cpum.GstCtx.es.Sel, pVCpu->cpum.GstCtx.es.u64Base, pVCpu->cpum.GstCtx.es.u32Limit, pVCpu->cpum.GstCtx.es.Attr.u));
    Log(("LOADALL: SS:%04X, SS base:%08X, limit:%04X, attrs:%02X\n", pVCpu->cpum.GstCtx.ss.Sel, pVCpu->cpum.GstCtx.ss.u64Base, pVCpu->cpum.GstCtx.ss.u32Limit, pVCpu->cpum.GstCtx.ss.Attr.u));
    Log(("LOADALL: SI:%04X, DI:%04X, AX:%04X, BX:%04X, CX:%04X, DX:%04X\n", pVCpu->cpum.GstCtx.si, pVCpu->cpum.GstCtx.di, pVCpu->cpum.GstCtx.bx, pVCpu->cpum.GstCtx.bx, pVCpu->cpum.GstCtx.cx, pVCpu->cpum.GstCtx.dx));

    rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)pbMem, IEM_ACCESS_SYS_R);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* The CPL may change. It is taken from the "DPL fields of the SS and CS
     * descriptor caches" but there is no word as to what happens if those are
     * not identical (probably bad things).
     */
    pVCpu->iem.s.uCpl = pVCpu->cpum.GstCtx.cs.Attr.n.u2Dpl;

    CPUMSetChangedFlags(pVCpu, CPUM_CHANGED_HIDDEN_SEL_REGS | CPUM_CHANGED_IDTR | CPUM_CHANGED_GDTR | CPUM_CHANGED_TR | CPUM_CHANGED_LDTR);

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr);

/** @todo single stepping   */
    return rcStrict;
}


/**
 * Implements SYSCALL (AMD and Intel64).
 */
IEM_CIMPL_DEF_0(iemCImpl_syscall)
{
    /** @todo hack, LOADALL should be decoded as such on a 286. */
    if (RT_UNLIKELY(pVCpu->iem.s.uTargetCpu == IEMTARGETCPU_286))
        return iemCImpl_loadall286(pVCpu, cbInstr);

    /*
     * Check preconditions.
     *
     * Note that CPUs described in the documentation may load a few odd values
     * into CS and SS than we allow here.  This has yet to be checked on real
     * hardware.
     */
    if (!(pVCpu->cpum.GstCtx.msrEFER & MSR_K6_EFER_SCE))
    {
        Log(("syscall: Not enabled in EFER -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (!(pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE))
    {
        Log(("syscall: Protected mode is required -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    if (IEM_IS_GUEST_CPU_INTEL(pVCpu) && !CPUMIsGuestInLongModeEx(IEM_GET_CTX(pVCpu)))
    {
        Log(("syscall: Only available in long mode on intel -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_SYSCALL_MSRS);

    /** @todo verify RPL ignoring and CS=0xfff8 (i.e. SS == 0). */
    /** @todo what about LDT selectors? Shouldn't matter, really. */
    uint16_t uNewCs = (pVCpu->cpum.GstCtx.msrSTAR >> MSR_K6_STAR_SYSCALL_CS_SS_SHIFT) & X86_SEL_MASK_OFF_RPL;
    uint16_t uNewSs = uNewCs + 8;
    if (uNewCs == 0 || uNewSs == 0)
    {
        /** @todo Neither Intel nor AMD document this check. */
        Log(("syscall: msrSTAR.CS = 0 or SS = 0 -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /* Long mode and legacy mode differs. */
    if (CPUMIsGuestInLongModeEx(IEM_GET_CTX(pVCpu)))
    {
        uint64_t uNewRip = pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT ? pVCpu->cpum.GstCtx.msrLSTAR : pVCpu->cpum.GstCtx. msrCSTAR;

        /* This test isn't in the docs, but I'm not trusting the guys writing
           the MSRs to have validated the values as canonical like they should. */
        if (!IEM_IS_CANONICAL(uNewRip))
        {
            /** @todo Intel claims this can't happen because IA32_LSTAR MSR can't be written with non-canonical address. */
            Log(("syscall: New RIP not canonical -> #UD\n"));
            return iemRaiseUndefinedOpcode(pVCpu);
        }

        /*
         * Commit it.
         */
        Log(("syscall: %04x:%016RX64 [efl=%#llx] -> %04x:%016RX64\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pVCpu->cpum.GstCtx.rflags.u, uNewCs, uNewRip));
        pVCpu->cpum.GstCtx.rcx           = pVCpu->cpum.GstCtx.rip + cbInstr;
        pVCpu->cpum.GstCtx.rip           = uNewRip;

        pVCpu->cpum.GstCtx.rflags.u     &= ~X86_EFL_RF;
        pVCpu->cpum.GstCtx.r11           = pVCpu->cpum.GstCtx.rflags.u;
        pVCpu->cpum.GstCtx.rflags.u     &= ~pVCpu->cpum.GstCtx.msrSFMASK;
        pVCpu->cpum.GstCtx.rflags.u     |= X86_EFL_1;

        pVCpu->cpum.GstCtx.cs.Attr.u     = X86DESCATTR_P | X86DESCATTR_G | X86DESCATTR_L | X86DESCATTR_DT | X86_SEL_TYPE_ER_ACC;
        pVCpu->cpum.GstCtx.ss.Attr.u     = X86DESCATTR_P | X86DESCATTR_G | X86DESCATTR_D | X86DESCATTR_DT | X86_SEL_TYPE_RW_ACC;
    }
    else
    {
        /*
         * Commit it.
         */
        Log(("syscall: %04x:%08RX32 [efl=%#x] -> %04x:%08RX32\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.eip, pVCpu->cpum.GstCtx.eflags.u, uNewCs, (uint32_t)(pVCpu->cpum.GstCtx.msrSTAR & MSR_K6_STAR_SYSCALL_EIP_MASK)));
        pVCpu->cpum.GstCtx.rcx           = pVCpu->cpum.GstCtx.eip + cbInstr;
        pVCpu->cpum.GstCtx.rip           = pVCpu->cpum.GstCtx.msrSTAR & MSR_K6_STAR_SYSCALL_EIP_MASK;
        pVCpu->cpum.GstCtx.rflags.u     &= ~(X86_EFL_VM | X86_EFL_IF | X86_EFL_RF);

        pVCpu->cpum.GstCtx.cs.Attr.u     = X86DESCATTR_P | X86DESCATTR_G | X86DESCATTR_D | X86DESCATTR_DT | X86_SEL_TYPE_ER_ACC;
        pVCpu->cpum.GstCtx.ss.Attr.u     = X86DESCATTR_P | X86DESCATTR_G | X86DESCATTR_D | X86DESCATTR_DT | X86_SEL_TYPE_RW_ACC;
    }
    pVCpu->cpum.GstCtx.cs.Sel        = uNewCs;
    pVCpu->cpum.GstCtx.cs.ValidSel   = uNewCs;
    pVCpu->cpum.GstCtx.cs.u64Base    = 0;
    pVCpu->cpum.GstCtx.cs.u32Limit   = UINT32_MAX;
    pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;

    pVCpu->cpum.GstCtx.ss.Sel        = uNewSs;
    pVCpu->cpum.GstCtx.ss.ValidSel   = uNewSs;
    pVCpu->cpum.GstCtx.ss.u64Base    = 0;
    pVCpu->cpum.GstCtx.ss.u32Limit   = UINT32_MAX;
    pVCpu->cpum.GstCtx.ss.fFlags     = CPUMSELREG_FLAGS_VALID;

    pVCpu->iem.s.uCpl       = 0;
    pVCpu->iem.s.enmCpuMode = iemCalcCpuMode(pVCpu);

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr);

/** @todo single step   */
    return VINF_SUCCESS;
}


/**
 * Implements SYSRET (AMD and Intel64).
 */
IEM_CIMPL_DEF_0(iemCImpl_sysret)

{
    RT_NOREF_PV(cbInstr);

    /*
     * Check preconditions.
     *
     * Note that CPUs described in the documentation may load a few odd values
     * into CS and SS than we allow here.  This has yet to be checked on real
     * hardware.
     */
    if (!(pVCpu->cpum.GstCtx.msrEFER & MSR_K6_EFER_SCE))
    {
        Log(("sysret: Not enabled in EFER -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (IEM_IS_GUEST_CPU_INTEL(pVCpu) && !CPUMIsGuestInLongModeEx(IEM_GET_CTX(pVCpu)))
    {
        Log(("sysret: Only available in long mode on intel -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (!(pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE))
    {
        Log(("sysret: Protected mode is required -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    if (pVCpu->iem.s.uCpl != 0)
    {
        Log(("sysret: CPL must be 0 not %u -> #GP(0)\n", pVCpu->iem.s.uCpl));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_SYSCALL_MSRS);

    /** @todo Does SYSRET verify CS != 0 and SS != 0? Neither is valid in ring-3. */
    uint16_t uNewCs = (pVCpu->cpum.GstCtx.msrSTAR >> MSR_K6_STAR_SYSRET_CS_SS_SHIFT) & X86_SEL_MASK_OFF_RPL;
    uint16_t uNewSs = uNewCs + 8;
    if (pVCpu->iem.s.enmEffOpSize == IEMMODE_64BIT)
        uNewCs += 16;
    if (uNewCs == 0 || uNewSs == 0)
    {
        Log(("sysret: msrSTAR.CS = 0 or SS = 0 -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /*
     * Commit it.
     */
    if (CPUMIsGuestInLongModeEx(IEM_GET_CTX(pVCpu)))
    {
        if (pVCpu->iem.s.enmEffOpSize == IEMMODE_64BIT)
        {
            Log(("sysret: %04x:%016RX64 [efl=%#llx] -> %04x:%016RX64 [r11=%#llx]\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pVCpu->cpum.GstCtx.rflags.u, uNewCs, pVCpu->cpum.GstCtx.rcx, pVCpu->cpum.GstCtx.r11));
            /* Note! We disregard intel manual regarding the RCX canonical
                     check, ask intel+xen why AMD doesn't do it. */
            pVCpu->cpum.GstCtx.rip       = pVCpu->cpum.GstCtx.rcx;
            pVCpu->cpum.GstCtx.cs.Attr.u = X86DESCATTR_P | X86DESCATTR_G | X86DESCATTR_L | X86DESCATTR_DT | X86_SEL_TYPE_ER_ACC
                            | (3 << X86DESCATTR_DPL_SHIFT);
        }
        else
        {
            Log(("sysret: %04x:%016RX64 [efl=%#llx] -> %04x:%08RX32 [r11=%#llx]\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pVCpu->cpum.GstCtx.rflags.u, uNewCs, pVCpu->cpum.GstCtx.ecx, pVCpu->cpum.GstCtx.r11));
            pVCpu->cpum.GstCtx.rip       = pVCpu->cpum.GstCtx.ecx;
            pVCpu->cpum.GstCtx.cs.Attr.u = X86DESCATTR_P | X86DESCATTR_G | X86DESCATTR_D | X86DESCATTR_DT | X86_SEL_TYPE_ER_ACC
                            | (3 << X86DESCATTR_DPL_SHIFT);
        }
        /** @todo testcase: See what kind of flags we can make SYSRET restore and
         *        what it really ignores. RF and VM are hinted at being zero, by AMD.
         *        Intel says:  RFLAGS := (R11 & 3C7FD7H) | 2; */
        pVCpu->cpum.GstCtx.rflags.u      = pVCpu->cpum.GstCtx.r11 & (X86_EFL_POPF_BITS | X86_EFL_VIF | X86_EFL_VIP);
        pVCpu->cpum.GstCtx.rflags.u     |= X86_EFL_1;
    }
    else
    {
        Log(("sysret: %04x:%08RX32 [efl=%#x] -> %04x:%08RX32\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.eip, pVCpu->cpum.GstCtx.eflags.u, uNewCs, pVCpu->cpum.GstCtx.ecx));
        pVCpu->cpum.GstCtx.rip           = pVCpu->cpum.GstCtx.rcx;
        pVCpu->cpum.GstCtx.rflags.u     |= X86_EFL_IF;
        pVCpu->cpum.GstCtx.cs.Attr.u     = X86DESCATTR_P | X86DESCATTR_G | X86DESCATTR_D | X86DESCATTR_DT | X86_SEL_TYPE_ER_ACC
                            | (3 << X86DESCATTR_DPL_SHIFT);
    }
    pVCpu->cpum.GstCtx.cs.Sel        = uNewCs | 3;
    pVCpu->cpum.GstCtx.cs.ValidSel   = uNewCs | 3;
    pVCpu->cpum.GstCtx.cs.u64Base    = 0;
    pVCpu->cpum.GstCtx.cs.u32Limit   = UINT32_MAX;
    pVCpu->cpum.GstCtx.cs.fFlags     = CPUMSELREG_FLAGS_VALID;

    pVCpu->cpum.GstCtx.ss.Sel        = uNewSs | 3;
    pVCpu->cpum.GstCtx.ss.ValidSel   = uNewSs | 3;
    pVCpu->cpum.GstCtx.ss.fFlags     = CPUMSELREG_FLAGS_VALID;
    /* The SS hidden bits remains unchanged says AMD. To that I say "Yeah, right!". */
    pVCpu->cpum.GstCtx.ss.Attr.u    |= (3 << X86DESCATTR_DPL_SHIFT);
    /** @todo Testcase: verify that SS.u1Long and SS.u1DefBig are left unchanged
     *        on sysret. */

    pVCpu->iem.s.uCpl       = 3;
    pVCpu->iem.s.enmCpuMode = iemCalcCpuMode(pVCpu);

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr);

/** @todo single step   */
    return VINF_SUCCESS;
}


/**
 * Implements SYSENTER (Intel, 32-bit AMD).
 */
IEM_CIMPL_DEF_0(iemCImpl_sysenter)
{
    RT_NOREF(cbInstr);

    /*
     * Check preconditions.
     *
     * Note that CPUs described in the documentation may load a few odd values
     * into CS and SS than we allow here.  This has yet to be checked on real
     * hardware.
     */
    if (!IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fSysEnter)
    {
        Log(("sysenter: not supported -=> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (!(pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE))
    {
        Log(("sysenter: Protected or long mode is required -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    bool fIsLongMode = CPUMIsGuestInLongModeEx(IEM_GET_CTX(pVCpu));
    if (IEM_IS_GUEST_CPU_AMD(pVCpu) && fIsLongMode)
    {
        Log(("sysenter: Only available in protected mode on AMD -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_SYSENTER_MSRS);
    uint16_t uNewCs = pVCpu->cpum.GstCtx.SysEnter.cs;
    if ((uNewCs & X86_SEL_MASK_OFF_RPL) == 0)
    {
        Log(("sysenter: SYSENTER_CS = %#x -> #GP(0)\n", uNewCs));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /* This test isn't in the docs, it's just a safeguard against missing
       canonical checks when writing the registers. */
    if (RT_LIKELY(   !fIsLongMode
                  || (   IEM_IS_CANONICAL(pVCpu->cpum.GstCtx.SysEnter.eip)
                      && IEM_IS_CANONICAL(pVCpu->cpum.GstCtx.SysEnter.esp))))
    { /* likely */ }
    else
    {
        Log(("sysenter: SYSENTER_EIP = %#RX64 or/and SYSENTER_ESP = %#RX64 not canonical -> #GP(0)\n",
             pVCpu->cpum.GstCtx.SysEnter.eip, pVCpu->cpum.GstCtx.SysEnter.esp));
        return iemRaiseUndefinedOpcode(pVCpu);
    }

/** @todo Test: Sysenter from ring-0, ring-1 and ring-2.  */

    /*
     * Update registers and commit.
     */
    if (fIsLongMode)
    {
        Log(("sysenter: %04x:%016RX64 [efl=%#llx] -> %04x:%016RX64\n", pVCpu->cpum.GstCtx.cs, pVCpu->cpum.GstCtx.rip,
             pVCpu->cpum.GstCtx.rflags.u, uNewCs & X86_SEL_MASK_OFF_RPL, pVCpu->cpum.GstCtx.SysEnter.eip));
        pVCpu->cpum.GstCtx.rip          = pVCpu->cpum.GstCtx.SysEnter.eip;
        pVCpu->cpum.GstCtx.rsp          = pVCpu->cpum.GstCtx.SysEnter.esp;
        pVCpu->cpum.GstCtx.cs.Attr.u    = X86DESCATTR_L | X86DESCATTR_G | X86DESCATTR_P | X86DESCATTR_DT
                                        | X86DESCATTR_LIMIT_HIGH | X86_SEL_TYPE_ER_ACC;
    }
    else
    {
        Log(("sysenter: %04x:%08RX32 [efl=%#llx] -> %04x:%08RX32\n", pVCpu->cpum.GstCtx.cs, (uint32_t)pVCpu->cpum.GstCtx.rip,
             pVCpu->cpum.GstCtx.rflags.u, uNewCs & X86_SEL_MASK_OFF_RPL, (uint32_t)pVCpu->cpum.GstCtx.SysEnter.eip));
        pVCpu->cpum.GstCtx.rip          = (uint32_t)pVCpu->cpum.GstCtx.SysEnter.eip;
        pVCpu->cpum.GstCtx.rsp          = (uint32_t)pVCpu->cpum.GstCtx.SysEnter.esp;
        pVCpu->cpum.GstCtx.cs.Attr.u    = X86DESCATTR_D | X86DESCATTR_G | X86DESCATTR_P | X86DESCATTR_DT
                                        | X86DESCATTR_LIMIT_HIGH | X86_SEL_TYPE_ER_ACC;
    }
    pVCpu->cpum.GstCtx.cs.Sel           = uNewCs & X86_SEL_MASK_OFF_RPL;
    pVCpu->cpum.GstCtx.cs.ValidSel      = uNewCs & X86_SEL_MASK_OFF_RPL;
    pVCpu->cpum.GstCtx.cs.u64Base       = 0;
    pVCpu->cpum.GstCtx.cs.u32Limit      = UINT32_MAX;
    pVCpu->cpum.GstCtx.cs.fFlags        = CPUMSELREG_FLAGS_VALID;

    pVCpu->cpum.GstCtx.ss.Sel           = (uNewCs & X86_SEL_MASK_OFF_RPL) + 8;
    pVCpu->cpum.GstCtx.ss.ValidSel      = (uNewCs & X86_SEL_MASK_OFF_RPL) + 8;
    pVCpu->cpum.GstCtx.ss.u64Base       = 0;
    pVCpu->cpum.GstCtx.ss.u32Limit      = UINT32_MAX;
    pVCpu->cpum.GstCtx.ss.Attr.u        = X86DESCATTR_D | X86DESCATTR_G | X86DESCATTR_P | X86DESCATTR_DT
                                        | X86DESCATTR_LIMIT_HIGH | X86_SEL_TYPE_RW_ACC;
    pVCpu->cpum.GstCtx.ss.fFlags        = CPUMSELREG_FLAGS_VALID;

    pVCpu->cpum.GstCtx.rflags.Bits.u1IF = 0;
    pVCpu->cpum.GstCtx.rflags.Bits.u1VM = 0;
    pVCpu->cpum.GstCtx.rflags.Bits.u1RF = 0;

    pVCpu->iem.s.uCpl                   = 0;

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr);

/** @todo single stepping   */
    return VINF_SUCCESS;
}


/**
 * Implements SYSEXIT (Intel, 32-bit AMD).
 *
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_1(iemCImpl_sysexit, IEMMODE, enmEffOpSize)
{
    RT_NOREF(cbInstr);

    /*
     * Check preconditions.
     *
     * Note that CPUs described in the documentation may load a few odd values
     * into CS and SS than we allow here.  This has yet to be checked on real
     * hardware.
     */
    if (!IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fSysEnter)
    {
        Log(("sysexit: not supported -=> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (!(pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE))
    {
        Log(("sysexit: Protected or long mode is required -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    bool fIsLongMode = CPUMIsGuestInLongModeEx(IEM_GET_CTX(pVCpu));
    if (IEM_IS_GUEST_CPU_AMD(pVCpu) && fIsLongMode)
    {
        Log(("sysexit: Only available in protected mode on AMD -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (pVCpu->iem.s.uCpl != 0)
    {
        Log(("sysexit: CPL(=%u) != 0 -> #GP(0)\n", pVCpu->iem.s.uCpl));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_SYSENTER_MSRS);
    uint16_t uNewCs = pVCpu->cpum.GstCtx.SysEnter.cs;
    if ((uNewCs & X86_SEL_MASK_OFF_RPL) == 0)
    {
        Log(("sysexit: SYSENTER_CS = %#x -> #GP(0)\n", uNewCs));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /*
     * Update registers and commit.
     */
    if (enmEffOpSize == IEMMODE_64BIT)
    {
        Log(("sysexit: %04x:%016RX64 [efl=%#llx] -> %04x:%016RX64\n", pVCpu->cpum.GstCtx.cs, pVCpu->cpum.GstCtx.rip,
             pVCpu->cpum.GstCtx.rflags.u, (uNewCs | 3) + 32, pVCpu->cpum.GstCtx.rcx));
        pVCpu->cpum.GstCtx.rip          = pVCpu->cpum.GstCtx.rdx;
        pVCpu->cpum.GstCtx.rsp          = pVCpu->cpum.GstCtx.rcx;
        pVCpu->cpum.GstCtx.cs.Attr.u    = X86DESCATTR_L | X86DESCATTR_G | X86DESCATTR_P | X86DESCATTR_DT
                                        | X86DESCATTR_LIMIT_HIGH | X86_SEL_TYPE_ER_ACC | (3 << X86DESCATTR_DPL_SHIFT);
        pVCpu->cpum.GstCtx.cs.Sel       = (uNewCs | 3) + 32;
        pVCpu->cpum.GstCtx.cs.ValidSel  = (uNewCs | 3) + 32;
        pVCpu->cpum.GstCtx.ss.Sel       = (uNewCs | 3) + 40;
        pVCpu->cpum.GstCtx.ss.ValidSel  = (uNewCs | 3) + 40;
    }
    else
    {
        Log(("sysexit: %04x:%08RX64 [efl=%#llx] -> %04x:%08RX32\n", pVCpu->cpum.GstCtx.cs, pVCpu->cpum.GstCtx.rip,
             pVCpu->cpum.GstCtx.rflags.u, (uNewCs | 3) + 16, (uint32_t)pVCpu->cpum.GstCtx.edx));
        pVCpu->cpum.GstCtx.rip          = pVCpu->cpum.GstCtx.edx;
        pVCpu->cpum.GstCtx.rsp          = pVCpu->cpum.GstCtx.ecx;
        pVCpu->cpum.GstCtx.cs.Attr.u    = X86DESCATTR_D | X86DESCATTR_G | X86DESCATTR_P | X86DESCATTR_DT
                                        | X86DESCATTR_LIMIT_HIGH | X86_SEL_TYPE_ER_ACC | (3 << X86DESCATTR_DPL_SHIFT);
        pVCpu->cpum.GstCtx.cs.Sel       = (uNewCs | 3) + 16;
        pVCpu->cpum.GstCtx.cs.ValidSel  = (uNewCs | 3) + 16;
        pVCpu->cpum.GstCtx.ss.Sel       = (uNewCs | 3) + 24;
        pVCpu->cpum.GstCtx.ss.ValidSel  = (uNewCs | 3) + 24;
    }
    pVCpu->cpum.GstCtx.cs.u64Base       = 0;
    pVCpu->cpum.GstCtx.cs.u32Limit      = UINT32_MAX;
    pVCpu->cpum.GstCtx.cs.fFlags        = CPUMSELREG_FLAGS_VALID;

    pVCpu->cpum.GstCtx.ss.u64Base       = 0;
    pVCpu->cpum.GstCtx.ss.u32Limit      = UINT32_MAX;
    pVCpu->cpum.GstCtx.ss.Attr.u        = X86DESCATTR_D | X86DESCATTR_G | X86DESCATTR_P | X86DESCATTR_DT
                                        | X86DESCATTR_LIMIT_HIGH | X86_SEL_TYPE_RW_ACC | (3 << X86DESCATTR_DPL_SHIFT);
    pVCpu->cpum.GstCtx.ss.fFlags        = CPUMSELREG_FLAGS_VALID;
    pVCpu->cpum.GstCtx.rflags.Bits.u1RF = 0;

    pVCpu->iem.s.uCpl                   = 3;
/** @todo single stepping   */

    /* Flush the prefetch buffer. */
    IEM_FLUSH_PREFETCH_HEAVY(pVCpu, cbInstr);

    return VINF_SUCCESS;
}


/**
 * Completes a MOV SReg,XXX or POP SReg instruction.
 *
 * When not modifying SS or when we're already in an interrupt shadow we
 * can update RIP and finish the instruction the normal way.
 *
 * Otherwise, the MOV/POP SS interrupt shadow that we now enable will block
 * both TF and DBx events.  The TF will be ignored while the DBx ones will
 * be delayed till the next instruction boundrary.  For more details see
 * @sdmv3{077,200,6.8.3,Masking Exceptions and Interrupts When Switching Stacks}.
 */
DECLINLINE(VBOXSTRICTRC) iemCImpl_LoadSRegFinish(PVMCPUCC pVCpu, uint8_t cbInstr, uint8_t iSegReg)
{
    if (iSegReg != X86_SREG_SS || CPUMIsInInterruptShadow(&pVCpu->cpum.GstCtx))
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);

    iemRegAddToRip(pVCpu, cbInstr);
    pVCpu->cpum.GstCtx.eflags.uBoth &= ~X86_EFL_RF; /* Shadow int isn't set and DRx is delayed, so only clear RF. */
    CPUMSetInInterruptShadowSs(&pVCpu->cpum.GstCtx);

    return VINF_SUCCESS;
}


/**
 * Common worker for 'pop SReg', 'mov SReg, GReg' and 'lXs GReg, reg/mem'.
 *
 * @param   pVCpu       The cross context virtual CPU structure of the calling
 *                      thread.
 * @param   iSegReg     The segment register number (valid).
 * @param   uSel        The new selector value.
 */
static VBOXSTRICTRC iemCImpl_LoadSRegWorker(PVMCPUCC pVCpu, uint8_t iSegReg, uint16_t uSel)
{
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_SREG_FROM_IDX(iSegReg));
    uint16_t       *pSel = iemSRegRef(pVCpu, iSegReg);
    PCPUMSELREGHID  pHid = iemSRegGetHid(pVCpu, iSegReg);

    Assert(iSegReg <= X86_SREG_GS && iSegReg != X86_SREG_CS);

    /*
     * Real mode and V8086 mode are easy.
     */
    if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
    {
        *pSel           = uSel;
        pHid->u64Base   = (uint32_t)uSel << 4;
        pHid->ValidSel  = uSel;
        pHid->fFlags    = CPUMSELREG_FLAGS_VALID;
#if 0 /* AMD Volume 2, chapter 4.1 - "real mode segmentation" - states that limit and attributes are untouched. */
        /** @todo Does the CPU actually load limits and attributes in the
         *        real/V8086 mode segment load case?  It doesn't for CS in far
         *        jumps...  Affects unreal mode.  */
        pHid->u32Limit          = 0xffff;
        pHid->Attr.u = 0;
        pHid->Attr.n.u1Present  = 1;
        pHid->Attr.n.u1DescType = 1;
        pHid->Attr.n.u4Type     = iSegReg != X86_SREG_CS
                                ? X86_SEL_TYPE_RW
                                : X86_SEL_TYPE_READ | X86_SEL_TYPE_CODE;
#endif
    }
    /*
     * Protected mode.
     *
     * Check if it's a null segment selector value first, that's OK for DS, ES,
     * FS and GS.  If not null, then we have to load and parse the descriptor.
     */
    else if (!(uSel & X86_SEL_MASK_OFF_RPL))
    {
        Assert(iSegReg != X86_SREG_CS); /** @todo testcase for \#UD on MOV CS, ax! */
        if (iSegReg == X86_SREG_SS)
        {
            /* In 64-bit kernel mode, the stack can be 0 because of the way
               interrupts are dispatched. AMD seems to have a slighly more
               relaxed relationship to SS.RPL than intel does. */
            /** @todo We cannot 'mov ss, 3' in 64-bit kernel mode, can we? There is a testcase (bs-cpu-xcpt-1), but double check this! */
            if (   pVCpu->iem.s.enmCpuMode != IEMMODE_64BIT
                || pVCpu->iem.s.uCpl > 2
                || (   uSel != pVCpu->iem.s.uCpl
                    && !IEM_IS_GUEST_CPU_AMD(pVCpu)) )
            {
                Log(("load sreg %#x -> invalid stack selector, #GP(0)\n", uSel));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }
        }

        *pSel = uSel;   /* Not RPL, remember :-) */
        iemHlpLoadNullDataSelectorProt(pVCpu, pHid, uSel);
        if (iSegReg == X86_SREG_SS)
            pHid->Attr.u |= pVCpu->iem.s.uCpl << X86DESCATTR_DPL_SHIFT;
    }
    else
    {

        /* Fetch the descriptor. */
        IEMSELDESC Desc;
        VBOXSTRICTRC rcStrict = iemMemFetchSelDesc(pVCpu, &Desc, uSel, X86_XCPT_GP); /** @todo Correct exception? */
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;

        /* Check GPs first. */
        if (!Desc.Legacy.Gen.u1DescType)
        {
            Log(("load sreg %d (=%#x) - system selector (%#x) -> #GP\n", iSegReg, uSel, Desc.Legacy.Gen.u4Type));
            return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
        }
        if (iSegReg == X86_SREG_SS) /* SS gets different treatment */
        {
            if (    (Desc.Legacy.Gen.u4Type & X86_SEL_TYPE_CODE)
                || !(Desc.Legacy.Gen.u4Type & X86_SEL_TYPE_WRITE) )
            {
                Log(("load sreg SS, %#x - code or read only (%#x) -> #GP\n", uSel, Desc.Legacy.Gen.u4Type));
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
            }
            if ((uSel & X86_SEL_RPL) != pVCpu->iem.s.uCpl)
            {
                Log(("load sreg SS, %#x - RPL and CPL (%d) differs -> #GP\n", uSel, pVCpu->iem.s.uCpl));
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
            }
            if (Desc.Legacy.Gen.u2Dpl != pVCpu->iem.s.uCpl)
            {
                Log(("load sreg SS, %#x - DPL (%d) and CPL (%d) differs -> #GP\n", uSel, Desc.Legacy.Gen.u2Dpl, pVCpu->iem.s.uCpl));
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
            }
        }
        else
        {
            if ((Desc.Legacy.Gen.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_READ)) == X86_SEL_TYPE_CODE)
            {
                Log(("load sreg%u, %#x - execute only segment -> #GP\n", iSegReg, uSel));
                return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
            }
            if (   (Desc.Legacy.Gen.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_CONF))
                != (X86_SEL_TYPE_CODE | X86_SEL_TYPE_CONF))
            {
#if 0 /* this is what intel says. */
                if (   (uSel & X86_SEL_RPL) > Desc.Legacy.Gen.u2Dpl
                    && pVCpu->iem.s.uCpl        > Desc.Legacy.Gen.u2Dpl)
                {
                    Log(("load sreg%u, %#x - both RPL (%d) and CPL (%d) are greater than DPL (%d) -> #GP\n",
                         iSegReg, uSel, (uSel & X86_SEL_RPL), pVCpu->iem.s.uCpl, Desc.Legacy.Gen.u2Dpl));
                    return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
                }
#else /* this is what makes more sense. */
                if ((unsigned)(uSel & X86_SEL_RPL) > Desc.Legacy.Gen.u2Dpl)
                {
                    Log(("load sreg%u, %#x - RPL (%d) is greater than DPL (%d) -> #GP\n",
                         iSegReg, uSel, (uSel & X86_SEL_RPL), Desc.Legacy.Gen.u2Dpl));
                    return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
                }
                if (pVCpu->iem.s.uCpl > Desc.Legacy.Gen.u2Dpl)
                {
                    Log(("load sreg%u, %#x - CPL (%d) is greater than DPL (%d) -> #GP\n",
                         iSegReg, uSel, pVCpu->iem.s.uCpl, Desc.Legacy.Gen.u2Dpl));
                    return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uSel);
                }
#endif
            }
        }

        /* Is it there? */
        if (!Desc.Legacy.Gen.u1Present)
        {
            Log(("load sreg%d,%#x - segment not present -> #NP\n", iSegReg, uSel));
            return iemRaiseSelectorNotPresentBySelector(pVCpu, uSel);
        }

        /* The base and limit. */
        uint32_t cbLimit = X86DESC_LIMIT_G(&Desc.Legacy);
        uint64_t u64Base = X86DESC_BASE(&Desc.Legacy);

        /*
         * Ok, everything checked out fine.  Now set the accessed bit before
         * committing the result into the registers.
         */
        if (!(Desc.Legacy.Gen.u4Type & X86_SEL_TYPE_ACCESSED))
        {
            rcStrict = iemMemMarkSelDescAccessed(pVCpu, uSel);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
            Desc.Legacy.Gen.u4Type |= X86_SEL_TYPE_ACCESSED;
        }

        /* commit */
        *pSel = uSel;
        pHid->Attr.u   = X86DESC_GET_HID_ATTR(&Desc.Legacy);
        pHid->u32Limit = cbLimit;
        pHid->u64Base  = u64Base;
        pHid->ValidSel = uSel;
        pHid->fFlags   = CPUMSELREG_FLAGS_VALID;

        /** @todo check if the hidden bits are loaded correctly for 64-bit
         *        mode.  */
    }

    Assert(CPUMSELREG_ARE_HIDDEN_PARTS_VALID(pVCpu, pHid));
    CPUMSetChangedFlags(pVCpu, CPUM_CHANGED_HIDDEN_SEL_REGS);
    return VINF_SUCCESS;
}


/**
 * Implements 'mov SReg, r/m'.
 *
 * @param   iSegReg     The segment register number (valid).
 * @param   uSel        The new selector value.
 */
IEM_CIMPL_DEF_2(iemCImpl_load_SReg, uint8_t, iSegReg, uint16_t, uSel)
{
    VBOXSTRICTRC rcStrict = iemCImpl_LoadSRegWorker(pVCpu, iSegReg, uSel);
    if (rcStrict == VINF_SUCCESS)
        rcStrict = iemCImpl_LoadSRegFinish(pVCpu, cbInstr, iSegReg);
    return rcStrict;
}


/**
 * Implements 'pop SReg'.
 *
 * @param   iSegReg         The segment register number (valid).
 * @param   enmEffOpSize    The efficient operand size (valid).
 */
IEM_CIMPL_DEF_2(iemCImpl_pop_Sreg, uint8_t, iSegReg, IEMMODE, enmEffOpSize)
{
    VBOXSTRICTRC    rcStrict;

    /*
     * Read the selector off the stack and join paths with mov ss, reg.
     */
    RTUINT64U TmpRsp;
    TmpRsp.u = pVCpu->cpum.GstCtx.rsp;
    switch (enmEffOpSize)
    {
        case IEMMODE_16BIT:
        {
            uint16_t uSel;
            rcStrict = iemMemStackPopU16Ex(pVCpu, &uSel, &TmpRsp);
            if (rcStrict == VINF_SUCCESS)
                rcStrict = iemCImpl_LoadSRegWorker(pVCpu, iSegReg, uSel);
            break;
        }

        case IEMMODE_32BIT:
        {
            uint32_t u32Value;
            rcStrict = iemMemStackPopU32Ex(pVCpu, &u32Value, &TmpRsp);
            if (rcStrict == VINF_SUCCESS)
                rcStrict = iemCImpl_LoadSRegWorker(pVCpu, iSegReg, (uint16_t)u32Value);
            break;
        }

        case IEMMODE_64BIT:
        {
            uint64_t u64Value;
            rcStrict = iemMemStackPopU64Ex(pVCpu, &u64Value, &TmpRsp);
            if (rcStrict == VINF_SUCCESS)
                rcStrict = iemCImpl_LoadSRegWorker(pVCpu, iSegReg, (uint16_t)u64Value);
            break;
        }
        IEM_NOT_REACHED_DEFAULT_CASE_RET();
    }

    /*
     * If the load succeeded, commit the stack change and finish the instruction.
     */
    if (rcStrict == VINF_SUCCESS)
    {
        pVCpu->cpum.GstCtx.rsp = TmpRsp.u;
        rcStrict = iemCImpl_LoadSRegFinish(pVCpu, cbInstr, iSegReg);
    }

    return rcStrict;
}


/**
 * Implements lgs, lfs, les, lds & lss.
 */
IEM_CIMPL_DEF_5(iemCImpl_load_SReg_Greg, uint16_t, uSel, uint64_t, offSeg, uint8_t, iSegReg, uint8_t, iGReg, IEMMODE, enmEffOpSize)
{
    /*
     * Use iemCImpl_LoadSRegWorker to do the tricky segment register loading.
     */
    /** @todo verify and test that mov, pop and lXs works the segment
     *        register loading in the exact same way. */
    VBOXSTRICTRC rcStrict = iemCImpl_LoadSRegWorker(pVCpu, iSegReg, uSel);
    if (rcStrict == VINF_SUCCESS)
    {
        switch (enmEffOpSize)
        {
            case IEMMODE_16BIT:
                *(uint16_t *)iemGRegRef(pVCpu, iGReg) = offSeg;
                break;
            case IEMMODE_32BIT:
            case IEMMODE_64BIT:
                *(uint64_t *)iemGRegRef(pVCpu, iGReg) = offSeg;
                break;
            IEM_NOT_REACHED_DEFAULT_CASE_RET();
        }
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }
    return rcStrict;
}


/**
 * Helper for VERR, VERW, LAR, and LSL and loads the descriptor into memory.
 *
 * @retval VINF_SUCCESS on success.
 * @retval VINF_IEM_SELECTOR_NOT_OK if the selector isn't ok.
 * @retval iemMemFetchSysU64 return value.
 *
 * @param   pVCpu               The cross context virtual CPU structure of the calling thread.
 * @param   uSel                The selector value.
 * @param   fAllowSysDesc       Whether system descriptors are OK or not.
 * @param   pDesc               Where to return the descriptor on success.
 */
static VBOXSTRICTRC iemCImpl_LoadDescHelper(PVMCPUCC pVCpu, uint16_t uSel, bool fAllowSysDesc, PIEMSELDESC pDesc)
{
    pDesc->Long.au64[0] = 0;
    pDesc->Long.au64[1] = 0;

    if (!(uSel & X86_SEL_MASK_OFF_RPL)) /** @todo test this on 64-bit. */
        return VINF_IEM_SELECTOR_NOT_OK;

    /* Within the table limits? */
    RTGCPTR GCPtrBase;
    if (uSel & X86_SEL_LDT)
    {
        IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_LDTR);
        if (   !pVCpu->cpum.GstCtx.ldtr.Attr.n.u1Present
            || (uSel | X86_SEL_RPL_LDT) > pVCpu->cpum.GstCtx.ldtr.u32Limit )
            return VINF_IEM_SELECTOR_NOT_OK;
        GCPtrBase = pVCpu->cpum.GstCtx.ldtr.u64Base;
    }
    else
    {
        IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_GDTR);
        if ((uSel | X86_SEL_RPL_LDT) > pVCpu->cpum.GstCtx.gdtr.cbGdt)
            return VINF_IEM_SELECTOR_NOT_OK;
        GCPtrBase = pVCpu->cpum.GstCtx.gdtr.pGdt;
    }

    /* Fetch the descriptor. */
    VBOXSTRICTRC rcStrict = iemMemFetchSysU64(pVCpu, &pDesc->Legacy.u, UINT8_MAX, GCPtrBase + (uSel & X86_SEL_MASK));
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;
    if (!pDesc->Legacy.Gen.u1DescType)
    {
        if (!fAllowSysDesc)
            return VINF_IEM_SELECTOR_NOT_OK;
        if (CPUMIsGuestInLongModeEx(IEM_GET_CTX(pVCpu)))
        {
            rcStrict = iemMemFetchSysU64(pVCpu, &pDesc->Long.au64[1], UINT8_MAX, GCPtrBase + (uSel & X86_SEL_MASK) + 8);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
        }

    }

    return VINF_SUCCESS;
}


/**
 * Implements verr (fWrite = false) and verw (fWrite = true).
 */
IEM_CIMPL_DEF_2(iemCImpl_VerX, uint16_t, uSel, bool, fWrite)
{
    Assert(!IEM_IS_REAL_OR_V86_MODE(pVCpu));

    /** @todo figure whether the accessed bit is set or not. */

    bool         fAccessible = true;
    IEMSELDESC   Desc;
    VBOXSTRICTRC rcStrict = iemCImpl_LoadDescHelper(pVCpu, uSel, false /*fAllowSysDesc*/, &Desc);
    if (rcStrict == VINF_SUCCESS)
    {
        /* Check the descriptor, order doesn't matter much here. */
        if (   !Desc.Legacy.Gen.u1DescType
            || !Desc.Legacy.Gen.u1Present)
            fAccessible = false;
        else
        {
            if (  fWrite
                ? (Desc.Legacy.Gen.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_WRITE)) != X86_SEL_TYPE_WRITE
                : (Desc.Legacy.Gen.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_READ))  == X86_SEL_TYPE_CODE)
                fAccessible = false;

            /** @todo testcase for the conforming behavior. */
            if (   (Desc.Legacy.Gen.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_CONF))
                != (X86_SEL_TYPE_CODE | X86_SEL_TYPE_CONF))
            {
                if ((unsigned)(uSel & X86_SEL_RPL) > Desc.Legacy.Gen.u2Dpl)
                    fAccessible = false;
                else if (pVCpu->iem.s.uCpl > Desc.Legacy.Gen.u2Dpl)
                    fAccessible = false;
            }
        }

    }
    else if (rcStrict == VINF_IEM_SELECTOR_NOT_OK)
        fAccessible = false;
    else
        return rcStrict;

    /* commit */
    pVCpu->cpum.GstCtx.eflags.Bits.u1ZF = fAccessible;

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements LAR and LSL with 64-bit operand size.
 *
 * @returns VINF_SUCCESS.
 * @param   pu64Dst         Pointer to the destination register.
 * @param   uSel            The selector to load details for.
 * @param   fIsLar          true = LAR, false = LSL.
 */
IEM_CIMPL_DEF_3(iemCImpl_LarLsl_u64, uint64_t *, pu64Dst, uint16_t, uSel, bool, fIsLar)
{
    Assert(!IEM_IS_REAL_OR_V86_MODE(pVCpu));

    /** @todo figure whether the accessed bit is set or not. */

    bool         fDescOk = true;
    IEMSELDESC   Desc;
    VBOXSTRICTRC rcStrict = iemCImpl_LoadDescHelper(pVCpu, uSel, true /*fAllowSysDesc*/, &Desc);
    if (rcStrict == VINF_SUCCESS)
    {
        /*
         * Check the descriptor type.
         */
        if (!Desc.Legacy.Gen.u1DescType)
        {
            if (CPUMIsGuestInLongModeEx(IEM_GET_CTX(pVCpu)))
            {
                if (Desc.Long.Gen.u5Zeros)
                    fDescOk = false;
                else
                    switch (Desc.Long.Gen.u4Type)
                    {
                        /** @todo Intel lists 0 as valid for LSL, verify whether that's correct */
                        case AMD64_SEL_TYPE_SYS_TSS_AVAIL:
                        case AMD64_SEL_TYPE_SYS_TSS_BUSY:
                        case AMD64_SEL_TYPE_SYS_LDT: /** @todo Intel lists this as invalid for LAR, AMD and 32-bit does otherwise. */
                            break;
                        case AMD64_SEL_TYPE_SYS_CALL_GATE:
                            fDescOk = fIsLar;
                            break;
                        default:
                            fDescOk = false;
                            break;
                    }
            }
            else
            {
                switch (Desc.Long.Gen.u4Type)
                {
                    case X86_SEL_TYPE_SYS_286_TSS_AVAIL:
                    case X86_SEL_TYPE_SYS_286_TSS_BUSY:
                    case X86_SEL_TYPE_SYS_386_TSS_AVAIL:
                    case X86_SEL_TYPE_SYS_386_TSS_BUSY:
                    case X86_SEL_TYPE_SYS_LDT:
                        break;
                    case X86_SEL_TYPE_SYS_286_CALL_GATE:
                    case X86_SEL_TYPE_SYS_TASK_GATE:
                    case X86_SEL_TYPE_SYS_386_CALL_GATE:
                        fDescOk = fIsLar;
                        break;
                    default:
                        fDescOk = false;
                        break;
                }
            }
        }
        if (fDescOk)
        {
            /*
             * Check the RPL/DPL/CPL interaction..
             */
            /** @todo testcase for the conforming behavior. */
            if (   (Desc.Legacy.Gen.u4Type & (X86_SEL_TYPE_CODE | X86_SEL_TYPE_CONF)) != (X86_SEL_TYPE_CODE | X86_SEL_TYPE_CONF)
                || !Desc.Legacy.Gen.u1DescType)
            {
                if ((unsigned)(uSel & X86_SEL_RPL) > Desc.Legacy.Gen.u2Dpl)
                    fDescOk = false;
                else if (pVCpu->iem.s.uCpl > Desc.Legacy.Gen.u2Dpl)
                    fDescOk = false;
            }
        }

        if (fDescOk)
        {
            /*
             * All fine, start committing the result.
             */
            if (fIsLar)
                *pu64Dst = Desc.Legacy.au32[1] & UINT32_C(0x00ffff00);
            else
                *pu64Dst = X86DESC_LIMIT_G(&Desc.Legacy);
        }

    }
    else if (rcStrict == VINF_IEM_SELECTOR_NOT_OK)
        fDescOk = false;
    else
        return rcStrict;

    /* commit flags value and advance rip. */
    pVCpu->cpum.GstCtx.eflags.Bits.u1ZF = fDescOk;
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements LAR and LSL with 16-bit operand size.
 *
 * @returns VINF_SUCCESS.
 * @param   pu16Dst         Pointer to the destination register.
 * @param   uSel            The selector to load details for.
 * @param   fIsLar          true = LAR, false = LSL.
 */
IEM_CIMPL_DEF_3(iemCImpl_LarLsl_u16, uint16_t *, pu16Dst, uint16_t, uSel, bool, fIsLar)
{
    uint64_t u64TmpDst = *pu16Dst;
    IEM_CIMPL_CALL_3(iemCImpl_LarLsl_u64, &u64TmpDst, uSel, fIsLar);
    *pu16Dst = u64TmpDst;
    return VINF_SUCCESS;
}


/**
 * Implements lgdt.
 *
 * @param   iEffSeg         The segment of the new gdtr contents
 * @param   GCPtrEffSrc     The address of the new gdtr contents.
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_3(iemCImpl_lgdt, uint8_t, iEffSeg, RTGCPTR, GCPtrEffSrc, IEMMODE, enmEffOpSize)
{
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);

    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_DESC_TABLE_EXIT))
    {
        Log(("lgdt: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_NEEDS_INFO_RET(pVCpu, VMX_EXIT_GDTR_IDTR_ACCESS, VMXINSTRID_LGDT, cbInstr);
    }

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_GDTR_WRITES))
    {
        Log(("lgdt: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_GDTR_WRITE, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * Fetch the limit and base address.
     */
    uint16_t cbLimit;
    RTGCPTR  GCPtrBase;
    VBOXSTRICTRC rcStrict = iemMemFetchDataXdtr(pVCpu, &cbLimit, &GCPtrBase, iEffSeg, GCPtrEffSrc, enmEffOpSize);
    if (rcStrict == VINF_SUCCESS)
    {
        if (   pVCpu->iem.s.enmCpuMode != IEMMODE_64BIT
            || X86_IS_CANONICAL(GCPtrBase))
        {
            rcStrict = CPUMSetGuestGDTR(pVCpu, GCPtrBase, cbLimit);
            if (rcStrict == VINF_SUCCESS)
                rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }
        else
        {
            Log(("iemCImpl_lgdt: Non-canonical base %04x:%RGv\n", cbLimit, GCPtrBase));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }
    }
    return rcStrict;
}


/**
 * Implements sgdt.
 *
 * @param   iEffSeg         The segment where to store the gdtr content.
 * @param   GCPtrEffDst     The address where to store the gdtr content.
 */
IEM_CIMPL_DEF_2(iemCImpl_sgdt, uint8_t, iEffSeg, RTGCPTR, GCPtrEffDst)
{
    /*
     * Join paths with sidt.
     * Note! No CPL or V8086 checks here, it's a really sad story, ask Intel if
     *       you really must know.
     */
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_DESC_TABLE_EXIT))
    {
        Log(("sgdt: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_NEEDS_INFO_RET(pVCpu, VMX_EXIT_GDTR_IDTR_ACCESS, VMXINSTRID_SGDT, cbInstr);
    }

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_GDTR_READS))
    {
        Log(("sgdt: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_GDTR_READ, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_GDTR);
    VBOXSTRICTRC rcStrict = iemMemStoreDataXdtr(pVCpu, pVCpu->cpum.GstCtx.gdtr.cbGdt, pVCpu->cpum.GstCtx.gdtr.pGdt, iEffSeg, GCPtrEffDst);
    if (rcStrict == VINF_SUCCESS)
        rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    return rcStrict;
}


/**
 * Implements lidt.
 *
 * @param   iEffSeg         The segment of the new idtr contents
 * @param   GCPtrEffSrc     The address of the new idtr contents.
 * @param   enmEffOpSize    The effective operand size.
 */
IEM_CIMPL_DEF_3(iemCImpl_lidt, uint8_t, iEffSeg, RTGCPTR, GCPtrEffSrc, IEMMODE, enmEffOpSize)
{
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_IDTR_WRITES))
    {
        Log(("lidt: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_IDTR_WRITE, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * Fetch the limit and base address.
     */
    uint16_t cbLimit;
    RTGCPTR  GCPtrBase;
    VBOXSTRICTRC rcStrict = iemMemFetchDataXdtr(pVCpu, &cbLimit, &GCPtrBase, iEffSeg, GCPtrEffSrc, enmEffOpSize);
    if (rcStrict == VINF_SUCCESS)
    {
        if (   pVCpu->iem.s.enmCpuMode != IEMMODE_64BIT
            || X86_IS_CANONICAL(GCPtrBase))
        {
            CPUMSetGuestIDTR(pVCpu, GCPtrBase, cbLimit);
            rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }
        else
        {
            Log(("iemCImpl_lidt: Non-canonical base %04x:%RGv\n", cbLimit, GCPtrBase));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }
    }
    return rcStrict;
}


/**
 * Implements sidt.
 *
 * @param   iEffSeg         The segment where to store the idtr content.
 * @param   GCPtrEffDst     The address where to store the idtr content.
 */
IEM_CIMPL_DEF_2(iemCImpl_sidt, uint8_t, iEffSeg, RTGCPTR, GCPtrEffDst)
{
    /*
     * Join paths with sgdt.
     * Note! No CPL or V8086 checks here, it's a really sad story, ask Intel if
     *       you really must know.
     */
    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_IDTR_READS))
    {
        Log(("sidt: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_IDTR_READ, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_IDTR);
    VBOXSTRICTRC rcStrict = iemMemStoreDataXdtr(pVCpu, pVCpu->cpum.GstCtx.idtr.cbIdt, pVCpu->cpum.GstCtx.idtr.pIdt, iEffSeg, GCPtrEffDst);
    if (rcStrict == VINF_SUCCESS)
        rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    return rcStrict;
}


/**
 * Implements lldt.
 *
 * @param   uNewLdt     The new LDT selector value.
 */
IEM_CIMPL_DEF_1(iemCImpl_lldt, uint16_t, uNewLdt)
{
    /*
     * Check preconditions.
     */
    if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
    {
        Log(("lldt %04x - real or v8086 mode -> #GP(0)\n", uNewLdt));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (pVCpu->iem.s.uCpl != 0)
    {
        Log(("lldt %04x - CPL is %d -> #GP(0)\n", uNewLdt, pVCpu->iem.s.uCpl));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    /* Nested-guest VMX intercept. */
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_DESC_TABLE_EXIT))
    {
        Log(("lldt: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_NEEDS_INFO_RET(pVCpu, VMX_EXIT_LDTR_TR_ACCESS, VMXINSTRID_LLDT, cbInstr);
    }
    if (uNewLdt & X86_SEL_LDT)
    {
        Log(("lldt %04x - LDT selector -> #GP\n", uNewLdt));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewLdt);
    }

    /*
     * Now, loading a NULL selector is easy.
     */
    if (!(uNewLdt & X86_SEL_MASK_OFF_RPL))
    {
        /* Nested-guest SVM intercept. */
        if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_LDTR_WRITES))
        {
            Log(("lldt: Guest intercept -> #VMEXIT\n"));
            IEM_SVM_UPDATE_NRIP(pVCpu);
            IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_LDTR_WRITE, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
        }

        Log(("lldt %04x: Loading NULL selector.\n", uNewLdt));
        pVCpu->cpum.GstCtx.fExtrn &= ~CPUMCTX_EXTRN_LDTR;
        CPUMSetGuestLDTR(pVCpu, uNewLdt);
        pVCpu->cpum.GstCtx.ldtr.ValidSel = uNewLdt;
        pVCpu->cpum.GstCtx.ldtr.fFlags   = CPUMSELREG_FLAGS_VALID;
        if (IEM_IS_GUEST_CPU_AMD(pVCpu))
        {
            /* AMD-V seems to leave the base and limit alone. */
            pVCpu->cpum.GstCtx.ldtr.Attr.u = X86DESCATTR_UNUSABLE;
        }
        else
        {
            /* VT-x (Intel 3960x) seems to be doing the following. */
            pVCpu->cpum.GstCtx.ldtr.Attr.u   = X86DESCATTR_UNUSABLE | X86DESCATTR_G | X86DESCATTR_D;
            pVCpu->cpum.GstCtx.ldtr.u64Base  = 0;
            pVCpu->cpum.GstCtx.ldtr.u32Limit = UINT32_MAX;
        }

        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }

    /*
     * Read the descriptor.
     */
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_LDTR | CPUMCTX_EXTRN_GDTR);
    IEMSELDESC Desc;
    VBOXSTRICTRC rcStrict = iemMemFetchSelDesc(pVCpu, &Desc, uNewLdt, X86_XCPT_GP); /** @todo Correct exception? */
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Check GPs first. */
    if (Desc.Legacy.Gen.u1DescType)
    {
        Log(("lldt %#x - not system selector (type %x) -> #GP\n", uNewLdt, Desc.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFault(pVCpu, uNewLdt & X86_SEL_MASK_OFF_RPL);
    }
    if (Desc.Legacy.Gen.u4Type != X86_SEL_TYPE_SYS_LDT)
    {
        Log(("lldt %#x - not LDT selector (type %x) -> #GP\n", uNewLdt, Desc.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFault(pVCpu, uNewLdt & X86_SEL_MASK_OFF_RPL);
    }
    uint64_t u64Base;
    if (!IEM_IS_LONG_MODE(pVCpu))
        u64Base = X86DESC_BASE(&Desc.Legacy);
    else
    {
        if (Desc.Long.Gen.u5Zeros)
        {
            Log(("lldt %#x - u5Zeros=%#x -> #GP\n", uNewLdt, Desc.Long.Gen.u5Zeros));
            return iemRaiseGeneralProtectionFault(pVCpu, uNewLdt & X86_SEL_MASK_OFF_RPL);
        }

        u64Base = X86DESC64_BASE(&Desc.Long);
        if (!IEM_IS_CANONICAL(u64Base))
        {
            Log(("lldt %#x - non-canonical base address %#llx -> #GP\n", uNewLdt, u64Base));
            return iemRaiseGeneralProtectionFault(pVCpu, uNewLdt & X86_SEL_MASK_OFF_RPL);
        }
    }

    /* NP */
    if (!Desc.Legacy.Gen.u1Present)
    {
        Log(("lldt %#x - segment not present -> #NP\n", uNewLdt));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uNewLdt);
    }

    /* Nested-guest SVM intercept. */
    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_LDTR_WRITES))
    {
        Log(("lldt: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_LDTR_WRITE, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * It checks out alright, update the registers.
     */
/** @todo check if the actual value is loaded or if the RPL is dropped */
    CPUMSetGuestLDTR(pVCpu, uNewLdt & X86_SEL_MASK_OFF_RPL);
    pVCpu->cpum.GstCtx.ldtr.ValidSel = uNewLdt & X86_SEL_MASK_OFF_RPL;
    pVCpu->cpum.GstCtx.ldtr.fFlags   = CPUMSELREG_FLAGS_VALID;
    pVCpu->cpum.GstCtx.ldtr.Attr.u   = X86DESC_GET_HID_ATTR(&Desc.Legacy);
    pVCpu->cpum.GstCtx.ldtr.u32Limit = X86DESC_LIMIT_G(&Desc.Legacy);
    pVCpu->cpum.GstCtx.ldtr.u64Base  = u64Base;

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements sldt GReg
 *
 * @param   iGReg           The general register to store the CRx value in.
 * @param   enmEffOpSize    The operand size.
 */
IEM_CIMPL_DEF_2(iemCImpl_sldt_reg, uint8_t, iGReg, uint8_t, enmEffOpSize)
{
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_DESC_TABLE_EXIT))
    {
        Log(("sldt: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_NEEDS_INFO_RET(pVCpu, VMX_EXIT_LDTR_TR_ACCESS, VMXINSTRID_SLDT, cbInstr);
    }

    IEM_SVM_CHECK_INSTR_INTERCEPT(pVCpu, SVM_CTRL_INTERCEPT_LDTR_READS, SVM_EXIT_LDTR_READ, 0, 0);

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_LDTR);
    switch (enmEffOpSize)
    {
        case IEMMODE_16BIT: *(uint16_t *)iemGRegRef(pVCpu, iGReg) = pVCpu->cpum.GstCtx.ldtr.Sel; break;
        case IEMMODE_32BIT: *(uint64_t *)iemGRegRef(pVCpu, iGReg) = pVCpu->cpum.GstCtx.ldtr.Sel; break;
        case IEMMODE_64BIT: *(uint64_t *)iemGRegRef(pVCpu, iGReg) = pVCpu->cpum.GstCtx.ldtr.Sel; break;
        IEM_NOT_REACHED_DEFAULT_CASE_RET();
    }
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements sldt mem.
 *
 * @param   iEffSeg         The effective segment register to use with @a GCPtrMem.
 * @param   GCPtrEffDst     Where to store the 16-bit CR0 value.
 */
IEM_CIMPL_DEF_2(iemCImpl_sldt_mem, uint8_t, iEffSeg, RTGCPTR, GCPtrEffDst)
{
    IEM_SVM_CHECK_INSTR_INTERCEPT(pVCpu, SVM_CTRL_INTERCEPT_LDTR_READS, SVM_EXIT_LDTR_READ, 0, 0);

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_LDTR);
    VBOXSTRICTRC rcStrict = iemMemStoreDataU16(pVCpu, iEffSeg, GCPtrEffDst, pVCpu->cpum.GstCtx.ldtr.Sel);
    if (rcStrict == VINF_SUCCESS)
        rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    return rcStrict;
}


/**
 * Implements ltr.
 *
 * @param   uNewTr      The new TSS selector value.
 */
IEM_CIMPL_DEF_1(iemCImpl_ltr, uint16_t, uNewTr)
{
    /*
     * Check preconditions.
     */
    if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
    {
        Log(("ltr %04x - real or v8086 mode -> #GP(0)\n", uNewTr));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (pVCpu->iem.s.uCpl != 0)
    {
        Log(("ltr %04x - CPL is %d -> #GP(0)\n", uNewTr, pVCpu->iem.s.uCpl));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_DESC_TABLE_EXIT))
    {
        Log(("ltr: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_NEEDS_INFO_RET(pVCpu, VMX_EXIT_LDTR_TR_ACCESS, VMXINSTRID_LTR, cbInstr);
    }
    if (uNewTr & X86_SEL_LDT)
    {
        Log(("ltr %04x - LDT selector -> #GP\n", uNewTr));
        return iemRaiseGeneralProtectionFaultBySelector(pVCpu, uNewTr);
    }
    if (!(uNewTr & X86_SEL_MASK_OFF_RPL))
    {
        Log(("ltr %04x - NULL selector -> #GP(0)\n", uNewTr));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_TR_WRITES))
    {
        Log(("ltr: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_TR_WRITE, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * Read the descriptor.
     */
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_LDTR | CPUMCTX_EXTRN_GDTR | CPUMCTX_EXTRN_TR);
    IEMSELDESC Desc;
    VBOXSTRICTRC rcStrict = iemMemFetchSelDesc(pVCpu, &Desc, uNewTr, X86_XCPT_GP); /** @todo Correct exception? */
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Check GPs first. */
    if (Desc.Legacy.Gen.u1DescType)
    {
        Log(("ltr %#x - not system selector (type %x) -> #GP\n", uNewTr, Desc.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFault(pVCpu, uNewTr & X86_SEL_MASK_OFF_RPL);
    }
    if (   Desc.Legacy.Gen.u4Type != X86_SEL_TYPE_SYS_386_TSS_AVAIL /* same as AMD64_SEL_TYPE_SYS_TSS_AVAIL */
        && (   Desc.Legacy.Gen.u4Type != X86_SEL_TYPE_SYS_286_TSS_AVAIL
            || IEM_IS_LONG_MODE(pVCpu)) )
    {
        Log(("ltr %#x - not an available TSS selector (type %x) -> #GP\n", uNewTr, Desc.Legacy.Gen.u4Type));
        return iemRaiseGeneralProtectionFault(pVCpu, uNewTr & X86_SEL_MASK_OFF_RPL);
    }
    uint64_t u64Base;
    if (!IEM_IS_LONG_MODE(pVCpu))
        u64Base = X86DESC_BASE(&Desc.Legacy);
    else
    {
        if (Desc.Long.Gen.u5Zeros)
        {
            Log(("ltr %#x - u5Zeros=%#x -> #GP\n", uNewTr, Desc.Long.Gen.u5Zeros));
            return iemRaiseGeneralProtectionFault(pVCpu, uNewTr & X86_SEL_MASK_OFF_RPL);
        }

        u64Base = X86DESC64_BASE(&Desc.Long);
        if (!IEM_IS_CANONICAL(u64Base))
        {
            Log(("ltr %#x - non-canonical base address %#llx -> #GP\n", uNewTr, u64Base));
            return iemRaiseGeneralProtectionFault(pVCpu, uNewTr & X86_SEL_MASK_OFF_RPL);
        }
    }

    /* NP */
    if (!Desc.Legacy.Gen.u1Present)
    {
        Log(("ltr %#x - segment not present -> #NP\n", uNewTr));
        return iemRaiseSelectorNotPresentBySelector(pVCpu, uNewTr);
    }

    /*
     * Set it busy.
     * Note! Intel says this should lock down the whole descriptor, but we'll
     *       restrict our selves to 32-bit for now due to lack of inline
     *       assembly and such.
     */
    void *pvDesc;
    rcStrict = iemMemMap(pVCpu, &pvDesc, 8, UINT8_MAX, pVCpu->cpum.GstCtx.gdtr.pGdt + (uNewTr & X86_SEL_MASK_OFF_RPL),
                         IEM_ACCESS_DATA_RW, 0);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;
    switch ((uintptr_t)pvDesc & 3)
    {
        case 0: ASMAtomicBitSet(pvDesc, 40 + 1); break;
        case 1: ASMAtomicBitSet((uint8_t *)pvDesc + 3, 40 + 1 - 24); break;
        case 2: ASMAtomicBitSet((uint8_t *)pvDesc + 2, 40 + 1 - 16); break;
        case 3: ASMAtomicBitSet((uint8_t *)pvDesc + 1, 40 + 1 -  8); break;
    }
    rcStrict = iemMemCommitAndUnmap(pVCpu, pvDesc, IEM_ACCESS_DATA_RW);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;
    Desc.Legacy.Gen.u4Type |= X86_SEL_TYPE_SYS_TSS_BUSY_MASK;

    /*
     * It checks out alright, update the registers.
     */
/** @todo check if the actual value is loaded or if the RPL is dropped */
    CPUMSetGuestTR(pVCpu, uNewTr & X86_SEL_MASK_OFF_RPL);
    pVCpu->cpum.GstCtx.tr.ValidSel = uNewTr & X86_SEL_MASK_OFF_RPL;
    pVCpu->cpum.GstCtx.tr.fFlags   = CPUMSELREG_FLAGS_VALID;
    pVCpu->cpum.GstCtx.tr.Attr.u   = X86DESC_GET_HID_ATTR(&Desc.Legacy);
    pVCpu->cpum.GstCtx.tr.u32Limit = X86DESC_LIMIT_G(&Desc.Legacy);
    pVCpu->cpum.GstCtx.tr.u64Base  = u64Base;

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements str GReg
 *
 * @param   iGReg           The general register to store the CRx value in.
 * @param   enmEffOpSize    The operand size.
 */
IEM_CIMPL_DEF_2(iemCImpl_str_reg, uint8_t, iGReg, uint8_t, enmEffOpSize)
{
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_DESC_TABLE_EXIT))
    {
        Log(("str_reg: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_NEEDS_INFO_RET(pVCpu, VMX_EXIT_LDTR_TR_ACCESS, VMXINSTRID_STR, cbInstr);
    }

    IEM_SVM_CHECK_INSTR_INTERCEPT(pVCpu, SVM_CTRL_INTERCEPT_TR_READS, SVM_EXIT_TR_READ, 0, 0);

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_TR);
    switch (enmEffOpSize)
    {
        case IEMMODE_16BIT: *(uint16_t *)iemGRegRef(pVCpu, iGReg) = pVCpu->cpum.GstCtx.tr.Sel; break;
        case IEMMODE_32BIT: *(uint64_t *)iemGRegRef(pVCpu, iGReg) = pVCpu->cpum.GstCtx.tr.Sel; break;
        case IEMMODE_64BIT: *(uint64_t *)iemGRegRef(pVCpu, iGReg) = pVCpu->cpum.GstCtx.tr.Sel; break;
        IEM_NOT_REACHED_DEFAULT_CASE_RET();
    }
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements str mem.
 *
 * @param   iEffSeg         The effective segment register to use with @a GCPtrMem.
 * @param   GCPtrEffDst     Where to store the 16-bit CR0 value.
 */
IEM_CIMPL_DEF_2(iemCImpl_str_mem, uint8_t, iEffSeg, RTGCPTR, GCPtrEffDst)
{
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_DESC_TABLE_EXIT))
    {
        Log(("str_mem: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_NEEDS_INFO_RET(pVCpu, VMX_EXIT_LDTR_TR_ACCESS, VMXINSTRID_STR, cbInstr);
    }

    IEM_SVM_CHECK_INSTR_INTERCEPT(pVCpu, SVM_CTRL_INTERCEPT_TR_READS, SVM_EXIT_TR_READ, 0, 0);

    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_TR);
    VBOXSTRICTRC rcStrict = iemMemStoreDataU16(pVCpu, iEffSeg, GCPtrEffDst, pVCpu->cpum.GstCtx.tr.Sel);
    if (rcStrict == VINF_SUCCESS)
        rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    return rcStrict;
}


/**
 * Implements mov GReg,CRx.
 *
 * @param   iGReg           The general register to store the CRx value in.
 * @param   iCrReg          The CRx register to read (valid).
 */
IEM_CIMPL_DEF_2(iemCImpl_mov_Rd_Cd, uint8_t, iGReg, uint8_t, iCrReg)
{
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);

    if (IEM_SVM_IS_READ_CR_INTERCEPT_SET(pVCpu, iCrReg))
    {
        Log(("iemCImpl_mov_Rd_Cd: Guest intercept CR%u -> #VMEXIT\n", iCrReg));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_CRX_VMEXIT_RET(pVCpu, SVM_EXIT_READ_CR0 + iCrReg, IEMACCESSCRX_MOV_CRX, iGReg);
    }

    /* Read it. */
    uint64_t crX;
    switch (iCrReg)
    {
        case 0:
            IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0);
            crX = pVCpu->cpum.GstCtx.cr0;
            if (IEM_GET_TARGET_CPU(pVCpu) <= IEMTARGETCPU_386)
                crX |= UINT32_C(0x7fffffe0); /* All reserved CR0 flags are set on a 386, just like MSW on 286. */
            break;
        case 2:
            IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_CR2);
            crX = pVCpu->cpum.GstCtx.cr2;
            break;
        case 3:
            IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR3);
            crX = pVCpu->cpum.GstCtx.cr3;
            break;
        case 4:
            IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR4);
            crX = pVCpu->cpum.GstCtx.cr4;
            break;
        case 8:
        {
            IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_APIC_TPR);
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
            if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
            {
                VBOXSTRICTRC rcStrict = iemVmxVmexitInstrMovFromCr8(pVCpu, iGReg, cbInstr);
                if (rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE)
                    return rcStrict;

                /*
                 * If the Mov-from-CR8 doesn't cause a VM-exit, bits 7:4 of the VTPR is copied
                 * to bits 0:3 of the destination operand. Bits 63:4 of the destination operand
                 * are cleared.
                 *
                 * See Intel Spec. 29.3 "Virtualizing CR8-based TPR Accesses"
                 */
                if (IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_USE_TPR_SHADOW))
                {
                    uint32_t const uTpr = iemVmxVirtApicReadRaw32(pVCpu, XAPIC_OFF_TPR);
                    crX = (uTpr >> 4) & 0xf;
                    break;
                }
            }
#endif
#ifdef VBOX_WITH_NESTED_HWVIRT_SVM
            if (CPUMIsGuestInSvmNestedHwVirtMode(IEM_GET_CTX(pVCpu)))
            {
                PCSVMVMCBCTRL pVmcbCtrl = &pVCpu->cpum.GstCtx.hwvirt.svm.Vmcb.ctrl;
                if (CPUMIsGuestSvmVirtIntrMasking(pVCpu, IEM_GET_CTX(pVCpu)))
                {
                    crX = pVmcbCtrl->IntCtrl.n.u8VTPR & 0xf;
                    break;
                }
            }
#endif
            uint8_t uTpr;
            int rc = APICGetTpr(pVCpu, &uTpr, NULL, NULL);
            if (RT_SUCCESS(rc))
                crX = uTpr >> 4;
            else
                crX = 0;
            break;
        }
        IEM_NOT_REACHED_DEFAULT_CASE_RET(); /* call checks */
    }

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        switch (iCrReg)
        {
            /* CR0/CR4 reads are subject to masking when in VMX non-root mode. */
            case 0: crX = CPUMGetGuestVmxMaskedCr0(&pVCpu->cpum.GstCtx, pVCpu->cpum.GstCtx.hwvirt.vmx.Vmcs.u64Cr0Mask.u); break;
            case 4: crX = CPUMGetGuestVmxMaskedCr4(&pVCpu->cpum.GstCtx, pVCpu->cpum.GstCtx.hwvirt.vmx.Vmcs.u64Cr4Mask.u); break;

            case 3:
            {
                VBOXSTRICTRC rcStrict = iemVmxVmexitInstrMovFromCr3(pVCpu, iGReg, cbInstr);
                if (rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE)
                    return rcStrict;
                break;
            }
        }
    }
#endif

    /* Store it. */
    if (pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT)
        *(uint64_t *)iemGRegRef(pVCpu, iGReg) = crX;
    else
        *(uint64_t *)iemGRegRef(pVCpu, iGReg) = (uint32_t)crX;

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements smsw GReg.
 *
 * @param   iGReg           The general register to store the CRx value in.
 * @param   enmEffOpSize    The operand size.
 */
IEM_CIMPL_DEF_2(iemCImpl_smsw_reg, uint8_t, iGReg, uint8_t, enmEffOpSize)
{
    IEM_SVM_CHECK_READ_CR0_INTERCEPT(pVCpu, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    uint64_t u64MaskedCr0;
    if (!IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
        u64MaskedCr0 = pVCpu->cpum.GstCtx.cr0;
    else
        u64MaskedCr0 = CPUMGetGuestVmxMaskedCr0(&pVCpu->cpum.GstCtx, pVCpu->cpum.GstCtx.hwvirt.vmx.Vmcs.u64Cr0Mask.u);
    uint64_t const u64GuestCr0 = u64MaskedCr0;
#else
    uint64_t const u64GuestCr0 = pVCpu->cpum.GstCtx.cr0;
#endif

    switch (enmEffOpSize)
    {
        case IEMMODE_16BIT:
            if (IEM_GET_TARGET_CPU(pVCpu) > IEMTARGETCPU_386)
                *(uint16_t *)iemGRegRef(pVCpu, iGReg) = (uint16_t)u64GuestCr0;
            else if (IEM_GET_TARGET_CPU(pVCpu) >= IEMTARGETCPU_386)
                *(uint16_t *)iemGRegRef(pVCpu, iGReg) = (uint16_t)u64GuestCr0 | 0xffe0;
            else
                *(uint16_t *)iemGRegRef(pVCpu, iGReg) = (uint16_t)u64GuestCr0 | 0xfff0;
            break;

        case IEMMODE_32BIT:
            *(uint32_t *)iemGRegRef(pVCpu, iGReg) = (uint32_t)u64GuestCr0;
            break;

        case IEMMODE_64BIT:
            *(uint64_t *)iemGRegRef(pVCpu, iGReg) = u64GuestCr0;
            break;

        IEM_NOT_REACHED_DEFAULT_CASE_RET();
    }

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements smsw mem.
 *
 * @param   iEffSeg         The effective segment register to use with @a GCPtrMem.
 * @param   GCPtrEffDst     Where to store the 16-bit CR0 value.
 */
IEM_CIMPL_DEF_2(iemCImpl_smsw_mem, uint8_t, iEffSeg, RTGCPTR, GCPtrEffDst)
{
    IEM_SVM_CHECK_READ_CR0_INTERCEPT(pVCpu, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    uint64_t u64MaskedCr0;
    if (!IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
        u64MaskedCr0 = pVCpu->cpum.GstCtx.cr0;
    else
        u64MaskedCr0 = CPUMGetGuestVmxMaskedCr0(&pVCpu->cpum.GstCtx, pVCpu->cpum.GstCtx.hwvirt.vmx.Vmcs.u64Cr0Mask.u);
    uint64_t const u64GuestCr0 = u64MaskedCr0;
#else
    uint64_t const u64GuestCr0 = pVCpu->cpum.GstCtx.cr0;
#endif

    uint16_t u16Value;
    if (IEM_GET_TARGET_CPU(pVCpu) > IEMTARGETCPU_386)
        u16Value = (uint16_t)u64GuestCr0;
    else if (IEM_GET_TARGET_CPU(pVCpu) >= IEMTARGETCPU_386)
        u16Value = (uint16_t)u64GuestCr0 | 0xffe0;
    else
        u16Value = (uint16_t)u64GuestCr0 | 0xfff0;

    VBOXSTRICTRC rcStrict = iemMemStoreDataU16(pVCpu, iEffSeg, GCPtrEffDst, u16Value);
    if (rcStrict == VINF_SUCCESS)
        rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    return rcStrict;
}


/**
 * Helper for mapping CR3 and PAE PDPEs for 'mov CRx,GReg'.
 */
#define IEM_MAP_PAE_PDPES_AT_CR3_RET(a_pVCpu, a_iCrReg, a_uCr3) \
    do \
    { \
        int const rcX = PGMGstMapPaePdpesAtCr3(a_pVCpu, a_uCr3); \
        if (RT_SUCCESS(rcX)) \
        { /* likely */ } \
        else \
        { \
            /* Either invalid PDPTEs or CR3 second-level translation failed. Raise #GP(0) either way. */ \
            Log(("iemCImpl_load_Cr%#x: Trying to load invalid PAE PDPEs\n", a_iCrReg)); \
            return iemRaiseGeneralProtectionFault0(a_pVCpu); \
        } \
    } while (0)


/**
 * Used to implemented 'mov CRx,GReg' and 'lmsw r/m16'.
 *
 * @param   iCrReg          The CRx register to write (valid).
 * @param   uNewCrX         The new value.
 * @param   enmAccessCrX    The instruction that caused the CrX load.
 * @param   iGReg           The general register in case of a 'mov CRx,GReg'
 *                          instruction.
 */
IEM_CIMPL_DEF_4(iemCImpl_load_CrX, uint8_t, iCrReg, uint64_t, uNewCrX, IEMACCESSCRX, enmAccessCrX, uint8_t, iGReg)
{
    VBOXSTRICTRC    rcStrict;
    int             rc;
#ifndef VBOX_WITH_NESTED_HWVIRT_SVM
    RT_NOREF2(iGReg, enmAccessCrX);
#endif

    /*
     * Try store it.
     * Unfortunately, CPUM only does a tiny bit of the work.
     */
    switch (iCrReg)
    {
        case 0:
        {
            /*
             * Perform checks.
             */
            IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0);

            uint64_t const uOldCrX = pVCpu->cpum.GstCtx.cr0;
            uint32_t const fValid  = CPUMGetGuestCR0ValidMask();

            /* ET is hardcoded on 486 and later. */
            if (IEM_GET_TARGET_CPU(pVCpu) > IEMTARGETCPU_486)
                uNewCrX |= X86_CR0_ET;
            /* The 386 and 486 didn't #GP(0) on attempting to set reserved CR0 bits. ET was settable on 386. */
            else if (IEM_GET_TARGET_CPU(pVCpu) == IEMTARGETCPU_486)
            {
                uNewCrX &= fValid;
                uNewCrX |= X86_CR0_ET;
            }
            else
                uNewCrX &= X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG | X86_CR0_ET;

            /* Check for reserved bits. */
            if (uNewCrX & ~(uint64_t)fValid)
            {
                Log(("Trying to set reserved CR0 bits: NewCR0=%#llx InvalidBits=%#llx\n", uNewCrX, uNewCrX & ~(uint64_t)fValid));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }

            /* Check for invalid combinations. */
            if (    (uNewCrX & X86_CR0_PG)
                && !(uNewCrX & X86_CR0_PE) )
            {
                Log(("Trying to set CR0.PG without CR0.PE\n"));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }

            if (   !(uNewCrX & X86_CR0_CD)
                && (uNewCrX & X86_CR0_NW) )
            {
                Log(("Trying to clear CR0.CD while leaving CR0.NW set\n"));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }

            if (   !(uNewCrX & X86_CR0_PG)
                && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_PCIDE))
            {
                Log(("Trying to clear CR0.PG while leaving CR4.PCID set\n"));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }

            /* Long mode consistency checks. */
            if (    (uNewCrX & X86_CR0_PG)
                && !(uOldCrX & X86_CR0_PG)
                &&  (pVCpu->cpum.GstCtx.msrEFER & MSR_K6_EFER_LME) )
            {
                if (!(pVCpu->cpum.GstCtx.cr4 & X86_CR4_PAE))
                {
                    Log(("Trying to enabled long mode paging without CR4.PAE set\n"));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }
                if (pVCpu->cpum.GstCtx.cs.Attr.n.u1Long)
                {
                    Log(("Trying to enabled long mode paging with a long CS descriptor loaded.\n"));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }
            }

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
            /* Check for bits that must remain set or cleared in VMX operation,
               see Intel spec. 23.8 "Restrictions on VMX operation". */
            if (IEM_VMX_IS_ROOT_MODE(pVCpu))
            {
                uint64_t const uCr0Fixed0 = iemVmxGetCr0Fixed0(pVCpu, IEM_VMX_IS_NON_ROOT_MODE(pVCpu));
                if ((uNewCrX & uCr0Fixed0) != uCr0Fixed0)
                {
                    Log(("Trying to clear reserved CR0 bits in VMX operation: NewCr0=%#llx MB1=%#llx\n", uNewCrX, uCr0Fixed0));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }

                uint64_t const uCr0Fixed1 = pVCpu->cpum.GstCtx.hwvirt.vmx.Msrs.u64Cr0Fixed1;
                if (uNewCrX & ~uCr0Fixed1)
                {
                    Log(("Trying to set reserved CR0 bits in VMX operation: NewCr0=%#llx MB0=%#llx\n", uNewCrX, uCr0Fixed1));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }
            }
#endif

            /*
             * SVM nested-guest CR0 write intercepts.
             */
            if (IEM_SVM_IS_WRITE_CR_INTERCEPT_SET(pVCpu, iCrReg))
            {
                Log(("iemCImpl_load_Cr%#x: Guest intercept -> #VMEXIT\n", iCrReg));
                IEM_SVM_UPDATE_NRIP(pVCpu);
                IEM_SVM_CRX_VMEXIT_RET(pVCpu, SVM_EXIT_WRITE_CR0, enmAccessCrX, iGReg);
            }
            if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_CR0_SEL_WRITE))
            {
                /* 'lmsw' intercepts regardless of whether the TS/MP bits are actually toggled. */
                if (   enmAccessCrX == IEMACCESSCRX_LMSW
                    || (uNewCrX & ~(X86_CR0_TS | X86_CR0_MP)) != (uOldCrX & ~(X86_CR0_TS | X86_CR0_MP)))
                {
                    Assert(enmAccessCrX != IEMACCESSCRX_CLTS);
                    Log(("iemCImpl_load_Cr%#x: lmsw or bits other than TS/MP changed: Guest intercept -> #VMEXIT\n", iCrReg));
                    IEM_SVM_UPDATE_NRIP(pVCpu);
                    IEM_SVM_CRX_VMEXIT_RET(pVCpu, SVM_EXIT_CR0_SEL_WRITE, enmAccessCrX, iGReg);
                }
            }

            /*
             * Change EFER.LMA if entering or leaving long mode.
             */
            uint64_t NewEFER = pVCpu->cpum.GstCtx.msrEFER;
            if (   (uNewCrX & X86_CR0_PG) != (uOldCrX & X86_CR0_PG)
                && (pVCpu->cpum.GstCtx.msrEFER & MSR_K6_EFER_LME) )
            {
                if (uNewCrX & X86_CR0_PG)
                    NewEFER |= MSR_K6_EFER_LMA;
                else
                    NewEFER &= ~MSR_K6_EFER_LMA;

                CPUMSetGuestEFER(pVCpu, NewEFER);
                Assert(pVCpu->cpum.GstCtx.msrEFER == NewEFER);
            }

            /*
             * Inform PGM.
             */
            if (    (uNewCrX & (X86_CR0_PG | X86_CR0_WP | X86_CR0_PE | X86_CR0_CD | X86_CR0_NW))
                !=  (uOldCrX & (X86_CR0_PG | X86_CR0_WP | X86_CR0_PE | X86_CR0_CD | X86_CR0_NW)) )
            {
                if (    enmAccessCrX != IEMACCESSCRX_MOV_CRX
                    || !CPUMIsPaePagingEnabled(uNewCrX, pVCpu->cpum.GstCtx.cr4, NewEFER)
                    ||  CPUMIsGuestInSvmNestedHwVirtMode(IEM_GET_CTX(pVCpu)))
                { /* likely */ }
                else
                    IEM_MAP_PAE_PDPES_AT_CR3_RET(pVCpu, iCrReg, pVCpu->cpum.GstCtx.cr3);
                rc = PGMFlushTLB(pVCpu, pVCpu->cpum.GstCtx.cr3, true /* global */);
                AssertRCReturn(rc, rc);
                /* ignore informational status codes */
            }

            /*
             * Change CR0.
             */
            CPUMSetGuestCR0(pVCpu, uNewCrX);
            Assert(pVCpu->cpum.GstCtx.cr0 == uNewCrX);

            rcStrict = PGMChangeMode(pVCpu, pVCpu->cpum.GstCtx.cr0, pVCpu->cpum.GstCtx.cr4, pVCpu->cpum.GstCtx.msrEFER,
                                     false /* fForce */);
            break;
        }

        /*
         * CR2 can be changed without any restrictions.
         */
        case 2:
        {
            if (IEM_SVM_IS_WRITE_CR_INTERCEPT_SET(pVCpu, /*cr*/ 2))
            {
                Log(("iemCImpl_load_Cr%#x: Guest intercept -> #VMEXIT\n", iCrReg));
                IEM_SVM_UPDATE_NRIP(pVCpu);
                IEM_SVM_CRX_VMEXIT_RET(pVCpu, SVM_EXIT_WRITE_CR2, enmAccessCrX, iGReg);
            }
            pVCpu->cpum.GstCtx.cr2 = uNewCrX;
            pVCpu->cpum.GstCtx.fExtrn &= ~CPUMCTX_EXTRN_CR2;
            rcStrict  = VINF_SUCCESS;
            break;
        }

        /*
         * CR3 is relatively simple, although AMD and Intel have different
         * accounts of how setting reserved bits are handled.  We take intel's
         * word for the lower bits and AMD's for the high bits (63:52).  The
         * lower reserved bits are ignored and left alone; OpenBSD 5.8 relies
         * on this.
         */
        /** @todo Testcase: Setting reserved bits in CR3, especially before
         *        enabling paging. */
        case 3:
        {
            IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR3);

            /* Bit 63 being clear in the source operand with PCIDE indicates no invalidations are required. */
            if (   (pVCpu->cpum.GstCtx.cr4 & X86_CR4_PCIDE)
                && (uNewCrX & RT_BIT_64(63)))
            {
                /** @todo r=ramshankar: avoiding a TLB flush altogether here causes Windows 10
                 *        SMP(w/o nested-paging) to hang during bootup on Skylake systems, see
                 *        Intel spec. 4.10.4.1 "Operations that Invalidate TLBs and
                 *        Paging-Structure Caches". */
                uNewCrX &= ~RT_BIT_64(63);
            }

            /* Check / mask the value. */
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX_EPT
            /* See Intel spec. 27.2.2 "EPT Translation Mechanism" footnote. */
            uint64_t const fInvPhysMask = !CPUMIsGuestVmxEptPagingEnabledEx(IEM_GET_CTX(pVCpu))
                                        ? (UINT64_MAX << IEM_GET_GUEST_CPU_FEATURES(pVCpu)->cMaxPhysAddrWidth)
                                        : (~X86_CR3_EPT_PAGE_MASK & X86_PAGE_4K_BASE_MASK);
#else
            uint64_t const fInvPhysMask = UINT64_C(0xfff0000000000000);
#endif
            if (uNewCrX & fInvPhysMask)
            {
                /** @todo Should we raise this only for 64-bit mode like Intel claims? AMD is
                 *        very vague in this area. As mentioned above, need testcase on real
                 *        hardware... Sigh. */
                Log(("Trying to load CR3 with invalid high bits set: %#llx\n", uNewCrX));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }

            uint64_t fValid;
            if (   (pVCpu->cpum.GstCtx.cr4 & X86_CR4_PAE)
                && (pVCpu->cpum.GstCtx.msrEFER & MSR_K6_EFER_LME))
            {
                /** @todo Redundant? This value has already been validated above. */
                fValid = UINT64_C(0x000fffffffffffff);
            }
            else
                fValid = UINT64_C(0xffffffff);
            if (uNewCrX & ~fValid)
            {
                Log(("Automatically clearing reserved MBZ bits in CR3 load: NewCR3=%#llx ClearedBits=%#llx\n",
                     uNewCrX, uNewCrX & ~fValid));
                uNewCrX &= fValid;
            }

            if (IEM_SVM_IS_WRITE_CR_INTERCEPT_SET(pVCpu, /*cr*/ 3))
            {
                Log(("iemCImpl_load_Cr%#x: Guest intercept -> #VMEXIT\n", iCrReg));
                IEM_SVM_UPDATE_NRIP(pVCpu);
                IEM_SVM_CRX_VMEXIT_RET(pVCpu, SVM_EXIT_WRITE_CR3, enmAccessCrX, iGReg);
            }

            /* Inform PGM. */
            if (pVCpu->cpum.GstCtx.cr0 & X86_CR0_PG)
            {
                if (   !CPUMIsGuestInPAEModeEx(IEM_GET_CTX(pVCpu))
                    ||  CPUMIsGuestInSvmNestedHwVirtMode(IEM_GET_CTX(pVCpu)))
                { /* likely */ }
                else
                {
                    Assert(enmAccessCrX == IEMACCESSCRX_MOV_CRX);
                    IEM_MAP_PAE_PDPES_AT_CR3_RET(pVCpu, iCrReg, uNewCrX);
                }
                rc = PGMFlushTLB(pVCpu, uNewCrX, !(pVCpu->cpum.GstCtx.cr4 & X86_CR4_PGE));
                AssertRCReturn(rc, rc);
                /* ignore informational status codes */
            }

            /* Make the change. */
            rc = CPUMSetGuestCR3(pVCpu, uNewCrX);
            AssertRCSuccessReturn(rc, rc);

            rcStrict = VINF_SUCCESS;
            break;
        }

        /*
         * CR4 is a bit more tedious as there are bits which cannot be cleared
         * under some circumstances and such.
         */
        case 4:
        {
            IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR4);
            uint64_t const uOldCrX = pVCpu->cpum.GstCtx.cr4;

            /* Reserved bits. */
            uint32_t const fValid = CPUMGetGuestCR4ValidMask(pVCpu->CTX_SUFF(pVM));
            if (uNewCrX & ~(uint64_t)fValid)
            {
                Log(("Trying to set reserved CR4 bits: NewCR4=%#llx InvalidBits=%#llx\n", uNewCrX, uNewCrX & ~(uint64_t)fValid));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }

            bool const fPcide    = !(uOldCrX & X86_CR4_PCIDE) && (uNewCrX & X86_CR4_PCIDE);
            bool const fLongMode = CPUMIsGuestInLongModeEx(IEM_GET_CTX(pVCpu));

            /* PCIDE check. */
            if (   fPcide
                && (   !fLongMode
                    || (pVCpu->cpum.GstCtx.cr3 & UINT64_C(0xfff))))
            {
                Log(("Trying to set PCIDE with invalid PCID or outside long mode. Pcid=%#x\n", (pVCpu->cpum.GstCtx.cr3 & UINT64_C(0xfff))));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }

            /* PAE check. */
            if (   fLongMode
                && (uOldCrX & X86_CR4_PAE)
                && !(uNewCrX & X86_CR4_PAE))
            {
                Log(("Trying to set clear CR4.PAE while long mode is active\n"));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }

            if (IEM_SVM_IS_WRITE_CR_INTERCEPT_SET(pVCpu, /*cr*/ 4))
            {
                Log(("iemCImpl_load_Cr%#x: Guest intercept -> #VMEXIT\n", iCrReg));
                IEM_SVM_UPDATE_NRIP(pVCpu);
                IEM_SVM_CRX_VMEXIT_RET(pVCpu, SVM_EXIT_WRITE_CR4, enmAccessCrX, iGReg);
            }

            /* Check for bits that must remain set or cleared in VMX operation,
               see Intel spec. 23.8 "Restrictions on VMX operation". */
            if (IEM_VMX_IS_ROOT_MODE(pVCpu))
            {
                uint64_t const uCr4Fixed0 = pVCpu->cpum.GstCtx.hwvirt.vmx.Msrs.u64Cr4Fixed0;
                if ((uNewCrX & uCr4Fixed0) != uCr4Fixed0)
                {
                    Log(("Trying to clear reserved CR4 bits in VMX operation: NewCr4=%#llx MB1=%#llx\n", uNewCrX, uCr4Fixed0));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }

                uint64_t const uCr4Fixed1 = pVCpu->cpum.GstCtx.hwvirt.vmx.Msrs.u64Cr4Fixed1;
                if (uNewCrX & ~uCr4Fixed1)
                {
                    Log(("Trying to set reserved CR4 bits in VMX operation: NewCr4=%#llx MB0=%#llx\n", uNewCrX, uCr4Fixed1));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }
            }

            /*
             * Notify PGM.
             */
            if ((uNewCrX ^ uOldCrX) & (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_PCIDE /* | X86_CR4_SMEP */))
            {
                if (   !CPUMIsPaePagingEnabled(pVCpu->cpum.GstCtx.cr0, uNewCrX, pVCpu->cpum.GstCtx.msrEFER)
                    || CPUMIsGuestInSvmNestedHwVirtMode(IEM_GET_CTX(pVCpu)))
                { /* likely */ }
                else
                {
                    Assert(enmAccessCrX == IEMACCESSCRX_MOV_CRX);
                    IEM_MAP_PAE_PDPES_AT_CR3_RET(pVCpu, iCrReg, pVCpu->cpum.GstCtx.cr3);
                }
                rc = PGMFlushTLB(pVCpu, pVCpu->cpum.GstCtx.cr3, true /* global */);
                AssertRCReturn(rc, rc);
                /* ignore informational status codes */
            }

            /*
             * Change it.
             */
            rc = CPUMSetGuestCR4(pVCpu, uNewCrX);
            AssertRCSuccessReturn(rc, rc);
            Assert(pVCpu->cpum.GstCtx.cr4 == uNewCrX);

            rcStrict = PGMChangeMode(pVCpu, pVCpu->cpum.GstCtx.cr0, pVCpu->cpum.GstCtx.cr4, pVCpu->cpum.GstCtx.msrEFER,
                                     false /* fForce */);
            break;
        }

        /*
         * CR8 maps to the APIC TPR.
         */
        case 8:
        {
            IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_APIC_TPR);
            if (uNewCrX & ~(uint64_t)0xf)
            {
                Log(("Trying to set reserved CR8 bits (%#RX64)\n", uNewCrX));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
            if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
                && IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_USE_TPR_SHADOW))
            {
                /*
                 * If the Mov-to-CR8 doesn't cause a VM-exit, bits 0:3 of the source operand
                 * is copied to bits 7:4 of the VTPR. Bits 0:3 and bits 31:8 of the VTPR are
                 * cleared. Following this the processor performs TPR virtualization.
                 *
                 * However, we should not perform TPR virtualization immediately here but
                 * after this instruction has completed.
                 *
                 * See Intel spec. 29.3 "Virtualizing CR8-based TPR Accesses"
                 * See Intel spec. 27.1 "Architectural State Before A VM-exit"
                 */
                uint32_t const uTpr = (uNewCrX & 0xf) << 4;
                Log(("iemCImpl_load_Cr%#x: Virtualizing TPR (%#x) write\n", iCrReg, uTpr));
                iemVmxVirtApicWriteRaw32(pVCpu, XAPIC_OFF_TPR, uTpr);
                iemVmxVirtApicSetPendingWrite(pVCpu, XAPIC_OFF_TPR);
                rcStrict = VINF_SUCCESS;
                break;
            }
#endif

#ifdef VBOX_WITH_NESTED_HWVIRT_SVM
            if (CPUMIsGuestInSvmNestedHwVirtMode(IEM_GET_CTX(pVCpu)))
            {
                if (IEM_SVM_IS_WRITE_CR_INTERCEPT_SET(pVCpu, /*cr*/ 8))
                {
                    Log(("iemCImpl_load_Cr%#x: Guest intercept -> #VMEXIT\n", iCrReg));
                    IEM_SVM_UPDATE_NRIP(pVCpu);
                    IEM_SVM_CRX_VMEXIT_RET(pVCpu, SVM_EXIT_WRITE_CR8, enmAccessCrX, iGReg);
                }

                pVCpu->cpum.GstCtx.hwvirt.svm.Vmcb.ctrl.IntCtrl.n.u8VTPR = uNewCrX;
                if (CPUMIsGuestSvmVirtIntrMasking(pVCpu, IEM_GET_CTX(pVCpu)))
                {
                    rcStrict = VINF_SUCCESS;
                    break;
                }
            }
#endif
            uint8_t const u8Tpr = (uint8_t)uNewCrX << 4;
            APICSetTpr(pVCpu, u8Tpr);
            rcStrict = VINF_SUCCESS;
            break;
        }

        IEM_NOT_REACHED_DEFAULT_CASE_RET(); /* call checks */
    }

    /*
     * Advance the RIP on success.
     */
    if (RT_SUCCESS(rcStrict))
    {
        if (rcStrict != VINF_SUCCESS)
            iemSetPassUpStatus(pVCpu, rcStrict);
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }

    return rcStrict;
}


/**
 * Implements mov CRx,GReg.
 *
 * @param   iCrReg          The CRx register to write (valid).
 * @param   iGReg           The general register to load the CRx value from.
 */
IEM_CIMPL_DEF_2(iemCImpl_mov_Cd_Rd, uint8_t, iCrReg, uint8_t, iGReg)
{
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);

    /*
     * Read the new value from the source register and call common worker.
     */
    uint64_t uNewCrX;
    if (pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT)
        uNewCrX = iemGRegFetchU64(pVCpu, iGReg);
    else
        uNewCrX = iemGRegFetchU32(pVCpu, iGReg);

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        VBOXSTRICTRC rcStrict = VINF_VMX_INTERCEPT_NOT_ACTIVE;
        switch (iCrReg)
        {
            case 0:
            case 4: rcStrict = iemVmxVmexitInstrMovToCr0Cr4(pVCpu, iCrReg, &uNewCrX, iGReg, cbInstr);   break;
            case 3: rcStrict = iemVmxVmexitInstrMovToCr3(pVCpu, uNewCrX, iGReg, cbInstr);               break;
            case 8: rcStrict = iemVmxVmexitInstrMovToCr8(pVCpu, iGReg, cbInstr);                        break;
        }
        if (rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE)
            return rcStrict;
    }
#endif

    return IEM_CIMPL_CALL_4(iemCImpl_load_CrX, iCrReg, uNewCrX, IEMACCESSCRX_MOV_CRX, iGReg);
}


/**
 * Implements 'LMSW r/m16'
 *
 * @param   u16NewMsw       The new value.
 * @param   GCPtrEffDst     The guest-linear address of the source operand in case
 *                          of a memory operand. For register operand, pass
 *                          NIL_RTGCPTR.
 */
IEM_CIMPL_DEF_2(iemCImpl_lmsw, uint16_t, u16NewMsw, RTGCPTR, GCPtrEffDst)
{
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0);

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    /* Check nested-guest VMX intercept and get updated MSW if there's no VM-exit. */
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        VBOXSTRICTRC rcStrict = iemVmxVmexitInstrLmsw(pVCpu, pVCpu->cpum.GstCtx.cr0, &u16NewMsw, GCPtrEffDst, cbInstr);
        if (rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE)
            return rcStrict;
    }
#else
    RT_NOREF_PV(GCPtrEffDst);
#endif

    /*
     * Compose the new CR0 value and call common worker.
     */
    uint64_t uNewCr0 = pVCpu->cpum.GstCtx.cr0  & ~(X86_CR0_MP | X86_CR0_EM | X86_CR0_TS);
    uNewCr0 |= u16NewMsw & (X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS);
    return IEM_CIMPL_CALL_4(iemCImpl_load_CrX, /*cr*/ 0, uNewCr0, IEMACCESSCRX_LMSW, UINT8_MAX /* iGReg */);
}


/**
 * Implements 'CLTS'.
 */
IEM_CIMPL_DEF_0(iemCImpl_clts)
{
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);

    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0);
    uint64_t uNewCr0 = pVCpu->cpum.GstCtx.cr0;
    uNewCr0 &= ~X86_CR0_TS;

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        VBOXSTRICTRC rcStrict = iemVmxVmexitInstrClts(pVCpu, cbInstr);
        if (rcStrict == VINF_VMX_MODIFIES_BEHAVIOR)
            uNewCr0 |= (pVCpu->cpum.GstCtx.cr0 & X86_CR0_TS);
        else if (rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE)
            return rcStrict;
    }
#endif

    return IEM_CIMPL_CALL_4(iemCImpl_load_CrX, /*cr*/ 0, uNewCr0, IEMACCESSCRX_CLTS, UINT8_MAX /* iGReg */);
}


/**
 * Implements mov GReg,DRx.
 *
 * @param   iGReg           The general register to store the DRx value in.
 * @param   iDrReg          The DRx register to read (0-7).
 */
IEM_CIMPL_DEF_2(iemCImpl_mov_Rd_Dd, uint8_t, iGReg, uint8_t, iDrReg)
{
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    /*
     * Check nested-guest VMX intercept.
     * Unlike most other intercepts, the Mov DRx intercept takes preceedence
     * over CPL and CR4.DE and even DR4/DR5 checks.
     *
     * See Intel spec. 25.1.3 "Instructions That Cause VM Exits Conditionally".
     */
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        VBOXSTRICTRC rcStrict = iemVmxVmexitInstrMovDrX(pVCpu, VMXINSTRID_MOV_FROM_DRX, iDrReg, iGReg, cbInstr);
        if (rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE)
            return rcStrict;
    }
#endif

    /*
     * Check preconditions.
     */
    /* Raise GPs. */
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_DR7);

    /** @todo \#UD in outside ring-0 too? */
    if (iDrReg == 4 || iDrReg == 5)
    {
        IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_CR4);
        if (pVCpu->cpum.GstCtx.cr4 & X86_CR4_DE)
        {
            Log(("mov r%u,dr%u: CR4.DE=1 -> #GP(0)\n", iGReg, iDrReg));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }
        iDrReg += 2;
    }

    /* Raise #DB if general access detect is enabled. */
    if (pVCpu->cpum.GstCtx.dr[7] & X86_DR7_GD)
    {
        Log(("mov r%u,dr%u: DR7.GD=1 -> #DB\n", iGReg, iDrReg));
        return iemRaiseDebugException(pVCpu);
    }

    /*
     * Read the debug register and store it in the specified general register.
     */
    uint64_t drX;
    switch (iDrReg)
    {
        case 0:
            IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_DR0_DR3);
            drX = pVCpu->cpum.GstCtx.dr[0];
            break;
        case 1:
            IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_DR0_DR3);
            drX = pVCpu->cpum.GstCtx.dr[1];
            break;
        case 2:
            IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_DR0_DR3);
            drX = pVCpu->cpum.GstCtx.dr[2];
            break;
        case 3:
            IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_DR0_DR3);
            drX = pVCpu->cpum.GstCtx.dr[3];
            break;
        case 6:
            IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_DR6);
            drX = pVCpu->cpum.GstCtx.dr[6];
            drX |= X86_DR6_RA1_MASK;
            drX &= ~X86_DR6_RAZ_MASK;
            break;
        case 7:
            IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_DR7);
            drX = pVCpu->cpum.GstCtx.dr[7];
            drX |=X86_DR7_RA1_MASK;
            drX &= ~X86_DR7_RAZ_MASK;
            break;
        IEM_NOT_REACHED_DEFAULT_CASE_RET(); /* caller checks */
    }

    /** @todo SVM nested-guest intercept for DR8-DR15? */
    /*
     * Check for any SVM nested-guest intercepts for the DRx read.
     */
    if (IEM_SVM_IS_READ_DR_INTERCEPT_SET(pVCpu, iDrReg))
    {
        Log(("mov r%u,dr%u: Guest intercept -> #VMEXIT\n", iGReg, iDrReg));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_READ_DR0 + (iDrReg & 0xf),
                           IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fSvmDecodeAssists ? (iGReg & 7) : 0, 0 /* uExitInfo2 */);
    }

    if (pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT)
        *(uint64_t *)iemGRegRef(pVCpu, iGReg) = drX;
    else
        *(uint64_t *)iemGRegRef(pVCpu, iGReg) = (uint32_t)drX;

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements mov DRx,GReg.
 *
 * @param   iDrReg          The DRx register to write (valid).
 * @param   iGReg           The general register to load the DRx value from.
 */
IEM_CIMPL_DEF_2(iemCImpl_mov_Dd_Rd, uint8_t, iDrReg, uint8_t, iGReg)
{
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    /*
     * Check nested-guest VMX intercept.
     * Unlike most other intercepts, the Mov DRx intercept takes preceedence
     * over CPL and CR4.DE and even DR4/DR5 checks.
     *
     * See Intel spec. 25.1.3 "Instructions That Cause VM Exits Conditionally".
     */
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        VBOXSTRICTRC rcStrict = iemVmxVmexitInstrMovDrX(pVCpu, VMXINSTRID_MOV_TO_DRX, iDrReg, iGReg, cbInstr);
        if (rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE)
            return rcStrict;
    }
#endif

    /*
     * Check preconditions.
     */
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_DR7);

    if (iDrReg == 4 || iDrReg == 5)
    {
        IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_CR4);
        if (pVCpu->cpum.GstCtx.cr4 & X86_CR4_DE)
        {
            Log(("mov dr%u,r%u: CR4.DE=1 -> #GP(0)\n", iDrReg, iGReg));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }
        iDrReg += 2;
    }

    /* Raise #DB if general access detect is enabled. */
    /** @todo is \#DB/DR7.GD raised before any reserved high bits in DR7/DR6
     *        \#GP? */
    if (pVCpu->cpum.GstCtx.dr[7] & X86_DR7_GD)
    {
        Log(("mov dr%u,r%u: DR7.GD=1 -> #DB\n", iDrReg, iGReg));
        return iemRaiseDebugException(pVCpu);
    }

    /*
     * Read the new value from the source register.
     */
    uint64_t uNewDrX;
    if (pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT)
        uNewDrX = iemGRegFetchU64(pVCpu, iGReg);
    else
        uNewDrX = iemGRegFetchU32(pVCpu, iGReg);

    /*
     * Adjust it.
     */
    switch (iDrReg)
    {
        case 0:
        case 1:
        case 2:
        case 3:
            /* nothing to adjust */
            break;

        case 6:
            if (uNewDrX & X86_DR6_MBZ_MASK)
            {
                Log(("mov dr%u,%#llx: DR6 high bits are not zero -> #GP(0)\n", iDrReg, uNewDrX));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }
            uNewDrX |= X86_DR6_RA1_MASK;
            uNewDrX &= ~X86_DR6_RAZ_MASK;
            break;

        case 7:
            if (uNewDrX & X86_DR7_MBZ_MASK)
            {
                Log(("mov dr%u,%#llx: DR7 high bits are not zero -> #GP(0)\n", iDrReg, uNewDrX));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }
            uNewDrX |= X86_DR7_RA1_MASK;
            uNewDrX &= ~X86_DR7_RAZ_MASK;
            break;

        IEM_NOT_REACHED_DEFAULT_CASE_RET();
    }

    /** @todo SVM nested-guest intercept for DR8-DR15? */
    /*
     * Check for any SVM nested-guest intercepts for the DRx write.
     */
    if (IEM_SVM_IS_WRITE_DR_INTERCEPT_SET(pVCpu, iDrReg))
    {
        Log2(("mov dr%u,r%u: Guest intercept -> #VMEXIT\n", iDrReg, iGReg));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_WRITE_DR0 + (iDrReg & 0xf),
                              IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fSvmDecodeAssists ? (iGReg & 7) : 0, 0 /* uExitInfo2 */);
    }

    /*
     * Do the actual setting.
     */
    if (iDrReg < 4)
        IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_DR0_DR3);
    else if (iDrReg == 6)
        IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_DR6);

    int rc = CPUMSetGuestDRx(pVCpu, iDrReg, uNewDrX);
    AssertRCSuccessReturn(rc, RT_SUCCESS_NP(rc) ? VERR_IEM_IPE_1 : rc);

    /*
     * Re-init hardware breakpoint summary if it was DR7 that got changed.
     */
    if (iDrReg == 7)
    {
        pVCpu->iem.s.fPendingInstructionBreakpoints = false;
        pVCpu->iem.s.fPendingDataBreakpoints        = false;
        pVCpu->iem.s.fPendingIoBreakpoints          = false;
        iemInitPendingBreakpointsSlow(pVCpu);
    }

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements mov GReg,TRx.
 *
 * @param   iGReg           The general register to store the
 *                          TRx value in.
 * @param   iTrReg          The TRx register to read (6/7).
 */
IEM_CIMPL_DEF_2(iemCImpl_mov_Rd_Td, uint8_t, iGReg, uint8_t, iTrReg)
{
    /*
     * Check preconditions. NB: This instruction is 386/486 only.
     */

    /* Raise GPs. */
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);

    if (iTrReg < 6 || iTrReg > 7)
    {
        /** @todo Do Intel CPUs reject this or are the TRs aliased? */
        Log(("mov r%u,tr%u: invalid register -> #GP(0)\n", iGReg, iTrReg));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /*
     * Read the test register and store it in the specified general register.
     * This is currently a dummy implementation that only exists to satisfy
     * old debuggers like WDEB386 or OS/2 KDB which unconditionally read the
     * TR6/TR7 registers. Software which actually depends on the TR values
     * (different on 386/486) is exceedingly rare.
     */
    uint64_t trX;
    switch (iTrReg)
    {
        case 6:
            trX = 0;    /* Currently a dummy. */
            break;
        case 7:
            trX = 0;    /* Currently a dummy. */
            break;
        IEM_NOT_REACHED_DEFAULT_CASE_RET(); /* call checks */
    }

    *(uint64_t *)iemGRegRef(pVCpu, iGReg) = (uint32_t)trX;

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements mov TRx,GReg.
 *
 * @param   iTrReg          The TRx register to write (valid).
 * @param   iGReg           The general register to load the TRx
 *                          value from.
 */
IEM_CIMPL_DEF_2(iemCImpl_mov_Td_Rd, uint8_t, iTrReg, uint8_t, iGReg)
{
    /*
     * Check preconditions. NB: This instruction is 386/486 only.
     */

    /* Raise GPs. */
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);

    if (iTrReg < 6 || iTrReg > 7)
    {
        /** @todo Do Intel CPUs reject this or are the TRs aliased? */
        Log(("mov r%u,tr%u: invalid register -> #GP(0)\n", iGReg, iTrReg));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /*
     * Read the new value from the source register.
     */
    uint64_t uNewTrX;
    if (pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT)
        uNewTrX = iemGRegFetchU64(pVCpu, iGReg);
    else
        uNewTrX = iemGRegFetchU32(pVCpu, iGReg);

    /*
     * Here we would do the actual setting if this weren't a dummy implementation.
     * This is currently a dummy implementation that only exists to prevent
     * old debuggers like WDEB386 or OS/2 KDB from crashing.
     */
    RT_NOREF(uNewTrX);

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'INVLPG m'.
 *
 * @param   GCPtrPage       The effective address of the page to invalidate.
 * @remarks Updates the RIP.
 */
IEM_CIMPL_DEF_1(iemCImpl_invlpg, RTGCPTR, GCPtrPage)
{
    /* ring-0 only. */
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);
    Assert(!pVCpu->cpum.GstCtx.eflags.Bits.u1VM);
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_CR3 | CPUMCTX_EXTRN_CR4 | CPUMCTX_EXTRN_EFER);

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_INVLPG_EXIT))
    {
        Log(("invlpg: Guest intercept (%RGp) -> VM-exit\n", GCPtrPage));
        return iemVmxVmexitInstrInvlpg(pVCpu, GCPtrPage, cbInstr);
    }
#endif

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_INVLPG))
    {
        Log(("invlpg: Guest intercept (%RGp) -> #VMEXIT\n", GCPtrPage));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_INVLPG,
                              IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fSvmDecodeAssists ? GCPtrPage : 0, 0 /* uExitInfo2 */);
    }

    int rc = PGMInvalidatePage(pVCpu, GCPtrPage);
    if (rc == VINF_SUCCESS)
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    if (rc == VINF_PGM_SYNC_CR3)
    {
        iemSetPassUpStatus(pVCpu, rc);
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }

    AssertMsg(RT_FAILURE_NP(rc), ("%Rrc\n", rc));
    Log(("PGMInvalidatePage(%RGv) -> %Rrc\n", GCPtrPage, rc));
    return rc;
}


/**
 * Implements INVPCID.
 *
 * @param   iEffSeg              The segment of the invpcid descriptor.
 * @param   GCPtrInvpcidDesc     The address of invpcid descriptor.
 * @param   uInvpcidType         The invalidation type.
 * @remarks Updates the RIP.
 */
IEM_CIMPL_DEF_3(iemCImpl_invpcid, uint8_t, iEffSeg, RTGCPTR, GCPtrInvpcidDesc, uint64_t, uInvpcidType)
{
    /*
     * Check preconditions.
     */
    if (!IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fInvpcid)
        return iemRaiseUndefinedOpcode(pVCpu);

    /* When in VMX non-root mode and INVPCID is not enabled, it results in #UD. */
    if (    IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && !IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_INVPCID))
    {
        Log(("invpcid: Not enabled for nested-guest execution -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }

    if (pVCpu->iem.s.uCpl != 0)
    {
        Log(("invpcid: CPL != 0 -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    if (IEM_IS_V86_MODE(pVCpu))
    {
        Log(("invpcid: v8086 mode -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /*
     * Check nested-guest intercept.
     *
     * INVPCID causes a VM-exit if "enable INVPCID" and "INVLPG exiting" are
     * both set. We have already checked the former earlier in this function.
     *
     * CPL and virtual-8086 mode checks take priority over this VM-exit.
     * See Intel spec. "25.1.1 Relative Priority of Faults and VM Exits".
     */
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_INVLPG_EXIT))
    {
        Log(("invpcid: Guest intercept -> #VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_NEEDS_INFO_RET(pVCpu, VMX_EXIT_INVPCID, VMXINSTRID_NONE, cbInstr);
    }

    if (uInvpcidType > X86_INVPCID_TYPE_MAX_VALID)
    {
        Log(("invpcid: invalid/unrecognized invpcid type %#RX64 -> #GP(0)\n", uInvpcidType));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_CR3 | CPUMCTX_EXTRN_CR4 | CPUMCTX_EXTRN_EFER);

    /*
     * Fetch the invpcid descriptor from guest memory.
     */
    RTUINT128U uDesc;
    VBOXSTRICTRC rcStrict = iemMemFetchDataU128(pVCpu, &uDesc, iEffSeg, GCPtrInvpcidDesc);
    if (rcStrict == VINF_SUCCESS)
    {
        /*
         * Validate the descriptor.
         */
        if (uDesc.s.Lo > 0xfff)
        {
            Log(("invpcid: reserved bits set in invpcid descriptor %#RX64 -> #GP(0)\n", uDesc.s.Lo));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }

        RTGCUINTPTR64 const GCPtrInvAddr = uDesc.s.Hi;
        uint8_t       const uPcid        = uDesc.s.Lo & UINT64_C(0xfff);
        uint32_t      const uCr4         = pVCpu->cpum.GstCtx.cr4;
        uint64_t      const uCr3         = pVCpu->cpum.GstCtx.cr3;
        switch (uInvpcidType)
        {
            case X86_INVPCID_TYPE_INDV_ADDR:
            {
                if (!IEM_IS_CANONICAL(GCPtrInvAddr))
                {
                    Log(("invpcid: invalidation address %#RGP is not canonical -> #GP(0)\n", GCPtrInvAddr));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }
                if (  !(uCr4 & X86_CR4_PCIDE)
                    && uPcid != 0)
                {
                    Log(("invpcid: invalid pcid %#x\n", uPcid));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }

                /* Invalidate mappings for the linear address tagged with PCID except global translations. */
                PGMFlushTLB(pVCpu, uCr3, false /* fGlobal */);
                break;
            }

            case X86_INVPCID_TYPE_SINGLE_CONTEXT:
            {
                if (  !(uCr4 & X86_CR4_PCIDE)
                    && uPcid != 0)
                {
                    Log(("invpcid: invalid pcid %#x\n", uPcid));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }
                /* Invalidate all mappings associated with PCID except global translations. */
                PGMFlushTLB(pVCpu, uCr3, false /* fGlobal */);
                break;
            }

            case X86_INVPCID_TYPE_ALL_CONTEXT_INCL_GLOBAL:
            {
                PGMFlushTLB(pVCpu, uCr3, true /* fGlobal */);
                break;
            }

            case X86_INVPCID_TYPE_ALL_CONTEXT_EXCL_GLOBAL:
            {
                PGMFlushTLB(pVCpu, uCr3, false /* fGlobal */);
                break;
            }
            IEM_NOT_REACHED_DEFAULT_CASE_RET();
        }
        rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }
    return rcStrict;
}


/**
 * Implements INVD.
 */
IEM_CIMPL_DEF_0(iemCImpl_invd)
{
    if (pVCpu->iem.s.uCpl != 0)
    {
        Log(("invd: CPL != 0 -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
        IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_INVD, cbInstr);

    IEM_SVM_CHECK_INSTR_INTERCEPT(pVCpu, SVM_CTRL_INTERCEPT_INVD, SVM_EXIT_INVD, 0, 0);

    /* We currently take no action here. */
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements WBINVD.
 */
IEM_CIMPL_DEF_0(iemCImpl_wbinvd)
{
    if (pVCpu->iem.s.uCpl != 0)
    {
        Log(("wbinvd: CPL != 0 -> #GP(0)\n"));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
        IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_WBINVD, cbInstr);

    IEM_SVM_CHECK_INSTR_INTERCEPT(pVCpu, SVM_CTRL_INTERCEPT_WBINVD, SVM_EXIT_WBINVD, 0, 0);

    /* We currently take no action here. */
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/** Opcode 0x0f 0xaa. */
IEM_CIMPL_DEF_0(iemCImpl_rsm)
{
    IEM_SVM_CHECK_INSTR_INTERCEPT(pVCpu, SVM_CTRL_INTERCEPT_RSM, SVM_EXIT_RSM, 0, 0);
    NOREF(cbInstr);
    return iemRaiseUndefinedOpcode(pVCpu);
}


/**
 * Implements RDTSC.
 */
IEM_CIMPL_DEF_0(iemCImpl_rdtsc)
{
    /*
     * Check preconditions.
     */
    if (!IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fTsc)
        return iemRaiseUndefinedOpcode(pVCpu);

    if (pVCpu->iem.s.uCpl != 0)
    {
        IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR4);
        if (pVCpu->cpum.GstCtx.cr4 & X86_CR4_TSD)
        {
            Log(("rdtsc: CR4.TSD and CPL=%u -> #GP(0)\n", pVCpu->iem.s.uCpl));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }
    }

    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_RDTSC_EXIT))
    {
        Log(("rdtsc: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_RDTSC, cbInstr);
    }

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_RDTSC))
    {
        Log(("rdtsc: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_RDTSC, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * Do the job.
     */
    uint64_t uTicks = TMCpuTickGet(pVCpu);
#if defined(VBOX_WITH_NESTED_HWVIRT_SVM) || defined(VBOX_WITH_NESTED_HWVIRT_VMX)
    uTicks = CPUMApplyNestedGuestTscOffset(pVCpu, uTicks);
#endif
    pVCpu->cpum.GstCtx.rax = RT_LO_U32(uTicks);
    pVCpu->cpum.GstCtx.rdx = RT_HI_U32(uTicks);
    pVCpu->cpum.GstCtx.fExtrn &= ~(CPUMCTX_EXTRN_RAX | CPUMCTX_EXTRN_RDX); /* For IEMExecDecodedRdtsc. */
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements RDTSC.
 */
IEM_CIMPL_DEF_0(iemCImpl_rdtscp)
{
    /*
     * Check preconditions.
     */
    if (!IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fRdTscP)
        return iemRaiseUndefinedOpcode(pVCpu);

    if (    IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && !IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_RDTSCP))
    {
        Log(("rdtscp: Not enabled for VMX non-root mode -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }

    if (pVCpu->iem.s.uCpl != 0)
    {
        IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR4);
        if (pVCpu->cpum.GstCtx.cr4 & X86_CR4_TSD)
        {
            Log(("rdtscp: CR4.TSD and CPL=%u -> #GP(0)\n", pVCpu->iem.s.uCpl));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }
    }

    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_RDTSC_EXIT))
    {
        Log(("rdtscp: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_RDTSCP, cbInstr);
    }
    else if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_RDTSCP))
    {
        Log(("rdtscp: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_RDTSCP, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * Do the job.
     * Query the MSR first in case of trips to ring-3.
     */
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_TSC_AUX);
    VBOXSTRICTRC rcStrict = CPUMQueryGuestMsr(pVCpu, MSR_K8_TSC_AUX, &pVCpu->cpum.GstCtx.rcx);
    if (rcStrict == VINF_SUCCESS)
    {
        /* Low dword of the TSC_AUX msr only. */
        pVCpu->cpum.GstCtx.rcx &= UINT32_C(0xffffffff);

        uint64_t uTicks = TMCpuTickGet(pVCpu);
#if defined(VBOX_WITH_NESTED_HWVIRT_SVM) || defined(VBOX_WITH_NESTED_HWVIRT_VMX)
        uTicks = CPUMApplyNestedGuestTscOffset(pVCpu, uTicks);
#endif
        pVCpu->cpum.GstCtx.rax = RT_LO_U32(uTicks);
        pVCpu->cpum.GstCtx.rdx = RT_HI_U32(uTicks);
        pVCpu->cpum.GstCtx.fExtrn &= ~(CPUMCTX_EXTRN_RAX | CPUMCTX_EXTRN_RDX | CPUMCTX_EXTRN_RCX); /* For IEMExecDecodedRdtscp. */
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }
    return rcStrict;
}


/**
 * Implements RDPMC.
 */
IEM_CIMPL_DEF_0(iemCImpl_rdpmc)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR4);

    if (   pVCpu->iem.s.uCpl != 0
        && !(pVCpu->cpum.GstCtx.cr4 & X86_CR4_PCE))
        return iemRaiseGeneralProtectionFault0(pVCpu);

    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_RDPMC_EXIT))
    {
        Log(("rdpmc: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_RDPMC, cbInstr);
    }

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_RDPMC))
    {
        Log(("rdpmc: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_RDPMC, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /** @todo Emulate performance counters, for now just return 0. */
    pVCpu->cpum.GstCtx.rax = 0;
    pVCpu->cpum.GstCtx.rdx = 0;
    pVCpu->cpum.GstCtx.fExtrn &= ~(CPUMCTX_EXTRN_RAX | CPUMCTX_EXTRN_RDX);
    /** @todo We should trigger a \#GP here if the CPU doesn't support the index in
     *        ecx but see @bugref{3472}! */

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements RDMSR.
 */
IEM_CIMPL_DEF_0(iemCImpl_rdmsr)
{
    /*
     * Check preconditions.
     */
    if (!IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fMsr)
        return iemRaiseUndefinedOpcode(pVCpu);
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);

    /*
     * Check nested-guest intercepts.
     */
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        if (iemVmxIsRdmsrWrmsrInterceptSet(pVCpu, VMX_EXIT_RDMSR, pVCpu->cpum.GstCtx.ecx))
            IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_RDMSR, cbInstr);
    }
#endif

#ifdef VBOX_WITH_NESTED_HWVIRT_SVM
    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_MSR_PROT))
    {
        VBOXSTRICTRC rcStrict = iemSvmHandleMsrIntercept(pVCpu, pVCpu->cpum.GstCtx.ecx, false /* fWrite */);
        if (rcStrict == VINF_SVM_VMEXIT)
            return VINF_SUCCESS;
        if (rcStrict != VINF_SVM_INTERCEPT_NOT_ACTIVE)
        {
            Log(("IEM: SVM intercepted rdmsr(%#x) failed. rc=%Rrc\n", pVCpu->cpum.GstCtx.ecx, VBOXSTRICTRC_VAL(rcStrict)));
            return rcStrict;
        }
    }
#endif

    /*
     * Do the job.
     */
    RTUINT64U uValue;
    /** @todo make CPUMAllMsrs.cpp import the necessary MSR state. */
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_ALL_MSRS);

    VBOXSTRICTRC rcStrict = CPUMQueryGuestMsr(pVCpu, pVCpu->cpum.GstCtx.ecx, &uValue.u);
    if (rcStrict == VINF_SUCCESS)
    {
        pVCpu->cpum.GstCtx.rax = uValue.s.Lo;
        pVCpu->cpum.GstCtx.rdx = uValue.s.Hi;
        pVCpu->cpum.GstCtx.fExtrn &= ~(CPUMCTX_EXTRN_RAX | CPUMCTX_EXTRN_RDX);

        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }

#ifndef IN_RING3
    /* Deferred to ring-3. */
    if (rcStrict == VINF_CPUM_R3_MSR_READ)
    {
        Log(("IEM: rdmsr(%#x) -> ring-3\n", pVCpu->cpum.GstCtx.ecx));
        return rcStrict;
    }
#endif

    /* Often a unimplemented MSR or MSR bit, so worth logging. */
    if (pVCpu->iem.s.cLogRelRdMsr < 32)
    {
        pVCpu->iem.s.cLogRelRdMsr++;
        LogRel(("IEM: rdmsr(%#x) -> #GP(0)\n", pVCpu->cpum.GstCtx.ecx));
    }
    else
        Log((   "IEM: rdmsr(%#x) -> #GP(0)\n", pVCpu->cpum.GstCtx.ecx));
    AssertMsgReturn(rcStrict == VERR_CPUM_RAISE_GP_0, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict)), VERR_IPE_UNEXPECTED_STATUS);
    return iemRaiseGeneralProtectionFault0(pVCpu);
}


/**
 * Implements WRMSR.
 */
IEM_CIMPL_DEF_0(iemCImpl_wrmsr)
{
    /*
     * Check preconditions.
     */
    if (!IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fMsr)
        return iemRaiseUndefinedOpcode(pVCpu);
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);

    RTUINT64U uValue;
    uValue.s.Lo = pVCpu->cpum.GstCtx.eax;
    uValue.s.Hi = pVCpu->cpum.GstCtx.edx;

    uint32_t const idMsr = pVCpu->cpum.GstCtx.ecx;

    /** @todo make CPUMAllMsrs.cpp import the necessary MSR state. */
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_ALL_MSRS);

    /*
     * Check nested-guest intercepts.
     */
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        if (iemVmxIsRdmsrWrmsrInterceptSet(pVCpu, VMX_EXIT_WRMSR, idMsr))
            IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_WRMSR, cbInstr);
    }
#endif

#ifdef VBOX_WITH_NESTED_HWVIRT_SVM
    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_MSR_PROT))
    {
        VBOXSTRICTRC rcStrict = iemSvmHandleMsrIntercept(pVCpu, idMsr, true /* fWrite */);
        if (rcStrict == VINF_SVM_VMEXIT)
            return VINF_SUCCESS;
        if (rcStrict != VINF_SVM_INTERCEPT_NOT_ACTIVE)
        {
            Log(("IEM: SVM intercepted rdmsr(%#x) failed. rc=%Rrc\n", idMsr, VBOXSTRICTRC_VAL(rcStrict)));
            return rcStrict;
        }
    }
#endif

    /*
     * Do the job.
     */
    VBOXSTRICTRC rcStrict = CPUMSetGuestMsr(pVCpu, idMsr, uValue.u);
    if (rcStrict == VINF_SUCCESS)
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);

#ifndef IN_RING3
    /* Deferred to ring-3. */
    if (rcStrict == VINF_CPUM_R3_MSR_WRITE)
    {
        Log(("IEM: wrmsr(%#x) -> ring-3\n", idMsr));
        return rcStrict;
    }
#endif

    /* Often a unimplemented MSR or MSR bit, so worth logging. */
    if (pVCpu->iem.s.cLogRelWrMsr < 32)
    {
        pVCpu->iem.s.cLogRelWrMsr++;
        LogRel(("IEM: wrmsr(%#x,%#x`%08x) -> #GP(0)\n", idMsr, uValue.s.Hi, uValue.s.Lo));
    }
    else
        Log((   "IEM: wrmsr(%#x,%#x`%08x) -> #GP(0)\n", idMsr, uValue.s.Hi, uValue.s.Lo));
    AssertMsgReturn(rcStrict == VERR_CPUM_RAISE_GP_0, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict)), VERR_IPE_UNEXPECTED_STATUS);
    return iemRaiseGeneralProtectionFault0(pVCpu);
}


/**
 * Implements 'IN eAX, port'.
 *
 * @param   u16Port     The source port.
 * @param   fImm        Whether the port was specified through an immediate operand
 *                      or the implicit DX register.
 * @param   cbReg       The register size.
 */
IEM_CIMPL_DEF_3(iemCImpl_in, uint16_t, u16Port, bool, fImm, uint8_t, cbReg)
{
    /*
     * CPL check
     */
    VBOXSTRICTRC rcStrict = iemHlpCheckPortIOPermission(pVCpu, u16Port, cbReg);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /*
     * Check VMX nested-guest IO intercept.
     */
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        rcStrict = iemVmxVmexitInstrIo(pVCpu, VMXINSTRID_IO_IN, u16Port, fImm, cbReg, cbInstr);
        if (rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE)
            return rcStrict;
    }
#else
    RT_NOREF(fImm);
#endif

    /*
     * Check SVM nested-guest IO intercept.
     */
#ifdef VBOX_WITH_NESTED_HWVIRT_SVM
    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_IOIO_PROT))
    {
        uint8_t cAddrSizeBits;
        switch (pVCpu->iem.s.enmEffAddrMode)
        {
            case IEMMODE_16BIT: cAddrSizeBits = 16; break;
            case IEMMODE_32BIT: cAddrSizeBits = 32; break;
            case IEMMODE_64BIT: cAddrSizeBits = 64; break;
            IEM_NOT_REACHED_DEFAULT_CASE_RET();
        }
        rcStrict = iemSvmHandleIOIntercept(pVCpu, u16Port, SVMIOIOTYPE_IN, cbReg, cAddrSizeBits, 0 /* N/A - iEffSeg */,
                                           false /* fRep */, false /* fStrIo */, cbInstr);
        if (rcStrict == VINF_SVM_VMEXIT)
            return VINF_SUCCESS;
        if (rcStrict != VINF_SVM_INTERCEPT_NOT_ACTIVE)
        {
            Log(("iemCImpl_in: iemSvmHandleIOIntercept failed (u16Port=%#x, cbReg=%u) rc=%Rrc\n", u16Port, cbReg,
                 VBOXSTRICTRC_VAL(rcStrict)));
            return rcStrict;
        }
    }
#endif

    /*
     * Perform the I/O.
     */
    PVMCC const pVM      = pVCpu->CTX_SUFF(pVM);
    uint32_t    u32Value = 0;
    rcStrict = IOMIOPortRead(pVM, pVCpu, u16Port, &u32Value, cbReg);
    if (IOM_SUCCESS(rcStrict))
    {
        switch (cbReg)
        {
            case 1: pVCpu->cpum.GstCtx.al  = (uint8_t)u32Value;  break;
            case 2: pVCpu->cpum.GstCtx.ax  = (uint16_t)u32Value; break;
            case 4: pVCpu->cpum.GstCtx.rax = u32Value;           break;
            default: AssertFailedReturn(VERR_IEM_IPE_3);
        }

        pVCpu->iem.s.cPotentialExits++;
        if (rcStrict != VINF_SUCCESS)
            iemSetPassUpStatus(pVCpu, rcStrict);

        /*
         * Check for I/O breakpoints before we complete the instruction.
         */
        uint32_t const fDr7 = pVCpu->cpum.GstCtx.dr[7];
        if (RT_UNLIKELY(   (   (   (fDr7 & X86_DR7_ENABLED_MASK)
                                && X86_DR7_ANY_RW_IO(fDr7)
                                && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_DE))
                            || pVM->dbgf.ro.cEnabledHwIoBreakpoints > 0)
                        && rcStrict == VINF_SUCCESS))
        {
            IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_DR0_DR3 | CPUMCTX_EXTRN_DR6);
            pVCpu->cpum.GstCtx.eflags.uBoth |= DBGFBpCheckIo2(pVM, pVCpu, u16Port, cbReg);
        }

        rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }

    return rcStrict;
}


/**
 * Implements 'IN eAX, DX'.
 *
 * @param   cbReg       The register size.
 */
IEM_CIMPL_DEF_1(iemCImpl_in_eAX_DX, uint8_t, cbReg)
{
    return IEM_CIMPL_CALL_3(iemCImpl_in, pVCpu->cpum.GstCtx.dx, false /* fImm */, cbReg);
}


/**
 * Implements 'OUT port, eAX'.
 *
 * @param   u16Port     The destination port.
 * @param   fImm        Whether the port was specified through an immediate operand
 *                      or the implicit DX register.
 * @param   cbReg       The register size.
 */
IEM_CIMPL_DEF_3(iemCImpl_out, uint16_t, u16Port, bool, fImm, uint8_t, cbReg)
{
    /*
     * CPL check
     */
    VBOXSTRICTRC rcStrict = iemHlpCheckPortIOPermission(pVCpu, u16Port, cbReg);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /*
     * Check VMX nested-guest I/O intercept.
     */
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        rcStrict = iemVmxVmexitInstrIo(pVCpu, VMXINSTRID_IO_OUT, u16Port, fImm, cbReg, cbInstr);
        if (rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE)
            return rcStrict;
    }
#else
    RT_NOREF(fImm);
#endif

    /*
     * Check SVM nested-guest I/O intercept.
     */
#ifdef VBOX_WITH_NESTED_HWVIRT_SVM
    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_IOIO_PROT))
    {
        uint8_t cAddrSizeBits;
        switch (pVCpu->iem.s.enmEffAddrMode)
        {
            case IEMMODE_16BIT: cAddrSizeBits = 16; break;
            case IEMMODE_32BIT: cAddrSizeBits = 32; break;
            case IEMMODE_64BIT: cAddrSizeBits = 64; break;
            IEM_NOT_REACHED_DEFAULT_CASE_RET();
        }
        rcStrict = iemSvmHandleIOIntercept(pVCpu, u16Port, SVMIOIOTYPE_OUT, cbReg, cAddrSizeBits, 0 /* N/A - iEffSeg */,
                                           false /* fRep */, false /* fStrIo */, cbInstr);
        if (rcStrict == VINF_SVM_VMEXIT)
            return VINF_SUCCESS;
        if (rcStrict != VINF_SVM_INTERCEPT_NOT_ACTIVE)
        {
            Log(("iemCImpl_out: iemSvmHandleIOIntercept failed (u16Port=%#x, cbReg=%u) rc=%Rrc\n", u16Port, cbReg,
                 VBOXSTRICTRC_VAL(rcStrict)));
            return rcStrict;
        }
    }
#endif

    /*
     * Perform the I/O.
     */
    PVMCC const pVM      = pVCpu->CTX_SUFF(pVM);
    uint32_t u32Value;
    switch (cbReg)
    {
        case 1: u32Value = pVCpu->cpum.GstCtx.al;  break;
        case 2: u32Value = pVCpu->cpum.GstCtx.ax;  break;
        case 4: u32Value = pVCpu->cpum.GstCtx.eax; break;
        default: AssertFailedReturn(VERR_IEM_IPE_4);
    }
    rcStrict = IOMIOPortWrite(pVM, pVCpu, u16Port, u32Value, cbReg);
    if (IOM_SUCCESS(rcStrict))
    {
        pVCpu->iem.s.cPotentialExits++;
        if (rcStrict != VINF_SUCCESS)
            iemSetPassUpStatus(pVCpu, rcStrict);

        /*
         * Check for I/O breakpoints before we complete the instruction.
         */
        uint32_t const fDr7 = pVCpu->cpum.GstCtx.dr[7];
        if (RT_UNLIKELY(   (   (   (fDr7 & X86_DR7_ENABLED_MASK)
                                && X86_DR7_ANY_RW_IO(fDr7)
                                && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_DE))
                            || pVM->dbgf.ro.cEnabledHwIoBreakpoints > 0)
                        && rcStrict == VINF_SUCCESS))
        {
            IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_DR0_DR3 | CPUMCTX_EXTRN_DR6);
            pVCpu->cpum.GstCtx.eflags.uBoth |= DBGFBpCheckIo2(pVM, pVCpu, u16Port, cbReg);
        }

        rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }
    return rcStrict;
}


/**
 * Implements 'OUT DX, eAX'.
 *
 * @param   cbReg       The register size.
 */
IEM_CIMPL_DEF_1(iemCImpl_out_DX_eAX, uint8_t, cbReg)
{
    return IEM_CIMPL_CALL_3(iemCImpl_out, pVCpu->cpum.GstCtx.dx, false /* fImm */, cbReg);
}


/**
 * Implements 'CLI'.
 */
IEM_CIMPL_DEF_0(iemCImpl_cli)
{
    uint32_t        fEfl    = IEMMISC_GET_EFL(pVCpu);
#ifdef LOG_ENABLED
    uint32_t const  fEflOld = fEfl;
#endif

    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_CR4);
    if (pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE)
    {
        uint8_t const uIopl = X86_EFL_GET_IOPL(fEfl);
        if (!(fEfl & X86_EFL_VM))
        {
            if (pVCpu->iem.s.uCpl <= uIopl)
                fEfl &= ~X86_EFL_IF;
            else if (   pVCpu->iem.s.uCpl == 3
                     && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_PVI) )
                fEfl &= ~X86_EFL_VIF;
            else
                return iemRaiseGeneralProtectionFault0(pVCpu);
        }
        /* V8086 */
        else if (uIopl == 3)
            fEfl &= ~X86_EFL_IF;
        else if (   uIopl < 3
                 && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_VME) )
            fEfl &= ~X86_EFL_VIF;
        else
            return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    /* real mode */
    else
        fEfl &= ~X86_EFL_IF;

    /* Commit. */
    IEMMISC_SET_EFL(pVCpu, fEfl);
    VBOXSTRICTRC const rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    Log2(("CLI: %#x -> %#x\n", fEflOld, fEfl));
    return rcStrict;
}


/**
 * Implements 'STI'.
 */
IEM_CIMPL_DEF_0(iemCImpl_sti)
{
    uint32_t        fEfl    = IEMMISC_GET_EFL(pVCpu);
    uint32_t const  fEflOld = fEfl;

    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_CR4);
    if (pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE)
    {
        uint8_t const uIopl = X86_EFL_GET_IOPL(fEfl);
        if (!(fEfl & X86_EFL_VM))
        {
            if (pVCpu->iem.s.uCpl <= uIopl)
                fEfl |= X86_EFL_IF;
            else if (   pVCpu->iem.s.uCpl == 3
                     && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_PVI)
                     && !(fEfl & X86_EFL_VIP) )
                fEfl |= X86_EFL_VIF;
            else
                return iemRaiseGeneralProtectionFault0(pVCpu);
        }
        /* V8086 */
        else if (uIopl == 3)
            fEfl |= X86_EFL_IF;
        else if (   uIopl < 3
                 && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_VME)
                 && !(fEfl & X86_EFL_VIP) )
            fEfl |= X86_EFL_VIF;
        else
            return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    /* real mode */
    else
        fEfl |= X86_EFL_IF;

    /*
     * Commit.
     *
     * Note! Setting the shadow interrupt flag must be done after RIP updating.
     */
    IEMMISC_SET_EFL(pVCpu, fEfl);
    VBOXSTRICTRC const rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    if (!(fEflOld & X86_EFL_IF) && (fEfl & X86_EFL_IF))
    {
        /** @todo only set it the shadow flag if it was clear before? */
        CPUMSetInInterruptShadowSti(&pVCpu->cpum.GstCtx);
    }
    Log2(("STI: %#x -> %#x\n", fEflOld, fEfl));
    return rcStrict;
}


/**
 * Implements 'HLT'.
 */
IEM_CIMPL_DEF_0(iemCImpl_hlt)
{
    if (pVCpu->iem.s.uCpl != 0)
        return iemRaiseGeneralProtectionFault0(pVCpu);

    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_HLT_EXIT))
    {
        Log2(("hlt: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_HLT, cbInstr);
    }

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_HLT))
    {
        Log2(("hlt: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_HLT, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /** @todo finish: This ASSUMES that iemRegAddToRipAndFinishingClearingRF won't
     * be returning any status codes relating to non-guest events being raised, as
     * we'll mess up the guest HALT otherwise.  */
    VBOXSTRICTRC rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    if (rcStrict == VINF_SUCCESS)
        rcStrict = VINF_EM_HALT;
    return rcStrict;
}


/**
 * Implements 'MONITOR'.
 */
IEM_CIMPL_DEF_1(iemCImpl_monitor, uint8_t, iEffSeg)
{
    /*
     * Permission checks.
     */
    if (pVCpu->iem.s.uCpl != 0)
    {
        Log2(("monitor: CPL != 0\n"));
        return iemRaiseUndefinedOpcode(pVCpu); /** @todo MSR[0xC0010015].MonMwaitUserEn if we care. */
    }
    if (!IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fMonitorMWait)
    {
        Log2(("monitor: Not in CPUID\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }

    /*
     * Check VMX guest-intercept.
     * This should be considered a fault-like VM-exit.
     * See Intel spec. 25.1.1 "Relative Priority of Faults and VM Exits".
     */
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_MONITOR_EXIT))
    {
        Log2(("monitor: Guest intercept -> #VMEXIT\n"));
        IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_MONITOR, cbInstr);
    }

    /*
     * Gather the operands and validate them.
     */
    RTGCPTR  GCPtrMem   = pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT ? pVCpu->cpum.GstCtx.rax : pVCpu->cpum.GstCtx.eax;
    uint32_t uEcx       = pVCpu->cpum.GstCtx.ecx;
    uint32_t uEdx       = pVCpu->cpum.GstCtx.edx;
/** @todo Test whether EAX or ECX is processed first, i.e. do we get \#PF or
 *        \#GP first. */
    if (uEcx != 0)
    {
        Log2(("monitor rax=%RX64, ecx=%RX32, edx=%RX32; ECX != 0 -> #GP(0)\n", GCPtrMem, uEcx, uEdx)); NOREF(uEdx);
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    VBOXSTRICTRC rcStrict = iemMemApplySegment(pVCpu, IEM_ACCESS_TYPE_READ | IEM_ACCESS_WHAT_DATA, iEffSeg, 1, &GCPtrMem);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    RTGCPHYS GCPhysMem;
    /** @todo access size   */
    rcStrict = iemMemPageTranslateAndCheckAccess(pVCpu, GCPtrMem, 1, IEM_ACCESS_TYPE_READ | IEM_ACCESS_WHAT_DATA, &GCPhysMem);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_VIRT_APIC_ACCESS))
    {
        /*
         * MONITOR does not access the memory, just monitors the address. However,
         * if the address falls in the APIC-access page, the address monitored must
         * instead be the corresponding address in the virtual-APIC page.
         *
         * See Intel spec. 29.4.4 "Instruction-Specific Considerations".
         */
        rcStrict = iemVmxVirtApicAccessUnused(pVCpu, &GCPhysMem, 1, IEM_ACCESS_TYPE_READ | IEM_ACCESS_WHAT_DATA);
        if (   rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE
            && rcStrict != VINF_VMX_MODIFIES_BEHAVIOR)
                return rcStrict;
    }
#endif

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_MONITOR))
    {
        Log2(("monitor: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_MONITOR, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * Call EM to prepare the monitor/wait.
     */
    rcStrict = EMMonitorWaitPrepare(pVCpu, pVCpu->cpum.GstCtx.rax, pVCpu->cpum.GstCtx.rcx, pVCpu->cpum.GstCtx.rdx, GCPhysMem);
    Assert(rcStrict == VINF_SUCCESS);
    if (rcStrict == VINF_SUCCESS)
        rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    return rcStrict;
}


/**
 * Implements 'MWAIT'.
 */
IEM_CIMPL_DEF_0(iemCImpl_mwait)
{
    /*
     * Permission checks.
     */
    if (pVCpu->iem.s.uCpl != 0)
    {
        Log2(("mwait: CPL != 0\n"));
        /** @todo MSR[0xC0010015].MonMwaitUserEn if we care. (Remember to check
         *        EFLAGS.VM then.) */
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (!IEM_GET_GUEST_CPU_FEATURES(pVCpu)->fMonitorMWait)
    {
        Log2(("mwait: Not in CPUID\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }

    /* Check VMX nested-guest intercept. */
    if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_MWAIT_EXIT))
        IEM_VMX_VMEXIT_MWAIT_RET(pVCpu, EMMonitorIsArmed(pVCpu), cbInstr);

    /*
     * Gather the operands and validate them.
     */
    uint32_t const uEax = pVCpu->cpum.GstCtx.eax;
    uint32_t const uEcx = pVCpu->cpum.GstCtx.ecx;
    if (uEcx != 0)
    {
        /* Only supported extension is break on IRQ when IF=0. */
        if (uEcx > 1)
        {
            Log2(("mwait eax=%RX32, ecx=%RX32; ECX > 1 -> #GP(0)\n", uEax, uEcx));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }
        uint32_t fMWaitFeatures = 0;
        uint32_t uIgnore = 0;
        CPUMGetGuestCpuId(pVCpu, 5, 0, -1 /*f64BitMode*/, &uIgnore, &uIgnore, &fMWaitFeatures, &uIgnore);
        if (    (fMWaitFeatures & (X86_CPUID_MWAIT_ECX_EXT | X86_CPUID_MWAIT_ECX_BREAKIRQIF0))
            !=                    (X86_CPUID_MWAIT_ECX_EXT | X86_CPUID_MWAIT_ECX_BREAKIRQIF0))
        {
            Log2(("mwait eax=%RX32, ecx=%RX32; break-on-IRQ-IF=0 extension not enabled -> #GP(0)\n", uEax, uEcx));
            return iemRaiseGeneralProtectionFault0(pVCpu);
        }

#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
        /*
         * If the interrupt-window exiting control is set or a virtual-interrupt is pending
         * for delivery; and interrupts are disabled the processor does not enter its
         * mwait state but rather passes control to the next instruction.
         *
         * See Intel spec. 25.3 "Changes to Instruction Behavior In VMX Non-root Operation".
         */
        if (    IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
            && !pVCpu->cpum.GstCtx.eflags.Bits.u1IF)
        {
            if (   IEM_VMX_IS_PROCCTLS_SET(pVCpu, VMX_PROC_CTLS_INT_WINDOW_EXIT)
                || VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NESTED_GUEST))
                /** @todo finish: check up this out after we move int window stuff out of the
                 *        run loop and into the instruction finishing logic here. */
                return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }
#endif
    }

    /*
     * Check SVM nested-guest mwait intercepts.
     */
    if (   IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_MWAIT_ARMED)
        && EMMonitorIsArmed(pVCpu))
    {
        Log2(("mwait: Guest intercept (monitor hardware armed) -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_MWAIT_ARMED, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }
    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_MWAIT))
    {
        Log2(("mwait: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_MWAIT, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }

    /*
     * Call EM to prepare the monitor/wait.
     *
     * This will return VINF_EM_HALT. If there the trap flag is set, we may
     * override it when executing iemRegAddToRipAndFinishingClearingRF ASSUMING
     * that will only return guest related events.
     */
    VBOXSTRICTRC rcStrict = EMMonitorWaitPerform(pVCpu, uEax, uEcx);

    /** @todo finish: This needs more thinking as we should suppress internal
     * debugger events here, or we'll bugger up the guest state even more than we
     * alread do around VINF_EM_HALT. */
    VBOXSTRICTRC rcStrict2 = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    if (rcStrict2 != VINF_SUCCESS)
    {
        Log2(("mwait: %Rrc (perform) -> %Rrc (finish)!\n", VBOXSTRICTRC_VAL(rcStrict), VBOXSTRICTRC_VAL(rcStrict2) ));
        rcStrict = rcStrict2;
    }

    return rcStrict;
}


/**
 * Implements 'SWAPGS'.
 */
IEM_CIMPL_DEF_0(iemCImpl_swapgs)
{
    Assert(pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT); /* Caller checks this. */

    /*
     * Permission checks.
     */
    if (pVCpu->iem.s.uCpl != 0)
    {
        Log2(("swapgs: CPL != 0\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }

    /*
     * Do the job.
     */
    IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_KERNEL_GS_BASE | CPUMCTX_EXTRN_GS);
    uint64_t uOtherGsBase = pVCpu->cpum.GstCtx.msrKERNELGSBASE;
    pVCpu->cpum.GstCtx.msrKERNELGSBASE = pVCpu->cpum.GstCtx.gs.u64Base;
    pVCpu->cpum.GstCtx.gs.u64Base = uOtherGsBase;

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


#ifndef VBOX_WITHOUT_CPUID_HOST_CALL
/**
 * Handles a CPUID call.
 */
static VBOXSTRICTRC iemCpuIdVBoxCall(PVMCPUCC pVCpu, uint32_t iFunction,
                                     uint32_t *pEax, uint32_t *pEbx, uint32_t *pEcx, uint32_t *pEdx)
{
    switch (iFunction)
    {
        case VBOX_CPUID_FN_ID:
            LogFlow(("iemCpuIdVBoxCall: VBOX_CPUID_FN_ID\n"));
            *pEax = VBOX_CPUID_RESP_ID_EAX;
            *pEbx = VBOX_CPUID_RESP_ID_EBX;
            *pEcx = VBOX_CPUID_RESP_ID_ECX;
            *pEdx = VBOX_CPUID_RESP_ID_EDX;
            break;

        case VBOX_CPUID_FN_LOG:
        {
            CPUM_IMPORT_EXTRN_RET(pVCpu, CPUMCTX_EXTRN_RDX | CPUMCTX_EXTRN_RBX | CPUMCTX_EXTRN_RSI
                                       | IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK);

            /* Validate input. */
            uint32_t cchToLog = *pEdx;
            if (cchToLog <= _2M)
            {
                uint32_t const uLogPicker = *pEbx;
                if (uLogPicker <= 1)
                {
                    /* Resolve the logger. */
                    PRTLOGGER const pLogger = !uLogPicker
                                            ? RTLogDefaultInstanceEx(UINT32_MAX) : RTLogRelGetDefaultInstanceEx(UINT32_MAX);
                    if (pLogger)
                    {
                        /* Copy over the data: */
                        RTGCPTR GCPtrSrc = pVCpu->cpum.GstCtx.rsi;
                        while (cchToLog > 0)
                        {
                            uint32_t cbToMap = GUEST_PAGE_SIZE - (GCPtrSrc & GUEST_PAGE_OFFSET_MASK);
                            if (cbToMap > cchToLog)
                                cbToMap = cchToLog;
                            /** @todo Extend iemMemMap to allowing page size accessing and avoid 7
                             *        unnecessary calls & iterations per pages. */
                            if (cbToMap > 512)
                                cbToMap = 512;
                            void        *pvSrc    = NULL;
                            VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, &pvSrc, cbToMap, UINT8_MAX, GCPtrSrc, IEM_ACCESS_DATA_R, 0);
                            if (rcStrict == VINF_SUCCESS)
                            {
                                RTLogBulkNestedWrite(pLogger, (const char *)pvSrc, cbToMap, "Gst:");
                                rcStrict = iemMemCommitAndUnmap(pVCpu, pvSrc, IEM_ACCESS_DATA_R);
                                AssertRCSuccessReturn(VBOXSTRICTRC_VAL(rcStrict), rcStrict);
                            }
                            else
                            {
                                Log(("iemCpuIdVBoxCall: %Rrc at %RGp LB %#x\n", VBOXSTRICTRC_VAL(rcStrict), GCPtrSrc, cbToMap));
                                return rcStrict;
                            }

                            /* Advance. */
                            pVCpu->cpum.GstCtx.rsi = GCPtrSrc += cbToMap;
                            *pEdx                  = cchToLog -= cbToMap;
                        }
                        *pEax = VINF_SUCCESS;
                    }
                    else
                        *pEax = (uint32_t)VERR_NOT_FOUND;
                }
                else
                    *pEax = (uint32_t)VERR_NOT_FOUND;
            }
            else
                *pEax = (uint32_t)VERR_TOO_MUCH_DATA;
            *pEdx = VBOX_CPUID_RESP_GEN_EDX;
            *pEcx = VBOX_CPUID_RESP_GEN_ECX;
            *pEbx = VBOX_CPUID_RESP_GEN_EBX;
            break;
        }

        default:
            LogFlow(("iemCpuIdVBoxCall: Invalid function %#x (%#x, %#x)\n", iFunction, *pEbx, *pEdx));
            *pEax = (uint32_t)VERR_INVALID_FUNCTION;
            *pEbx = (uint32_t)VERR_INVALID_FUNCTION;
            *pEcx = (uint32_t)VERR_INVALID_FUNCTION;
            *pEdx = (uint32_t)VERR_INVALID_FUNCTION;
            break;
    }
    return VINF_SUCCESS;
}
#endif /* VBOX_WITHOUT_CPUID_HOST_CALL */

/**
 * Implements 'CPUID'.
 */
IEM_CIMPL_DEF_0(iemCImpl_cpuid)
{
    if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
    {
        Log2(("cpuid: Guest intercept -> VM-exit\n"));
        IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_CPUID, cbInstr);
    }

    if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_CPUID))
    {
        Log2(("cpuid: Guest intercept -> #VMEXIT\n"));
        IEM_SVM_UPDATE_NRIP(pVCpu);
        IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_CPUID, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
    }


    uint32_t const uEax = pVCpu->cpum.GstCtx.eax;
    uint32_t const uEcx = pVCpu->cpum.GstCtx.ecx;

#ifndef VBOX_WITHOUT_CPUID_HOST_CALL
    /*
     * CPUID host call backdoor.
     */
    if (   uEax == VBOX_CPUID_REQ_EAX_FIXED
        && (uEcx & VBOX_CPUID_REQ_ECX_FIXED_MASK) == VBOX_CPUID_REQ_ECX_FIXED
        && pVCpu->CTX_SUFF(pVM)->iem.s.fCpuIdHostCall)
    {
        VBOXSTRICTRC rcStrict = iemCpuIdVBoxCall(pVCpu, uEcx & VBOX_CPUID_REQ_ECX_FN_MASK,
                                                 &pVCpu->cpum.GstCtx.eax, &pVCpu->cpum.GstCtx.ebx,
                                                 &pVCpu->cpum.GstCtx.ecx, &pVCpu->cpum.GstCtx.edx);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
    }
    /*
     * Regular CPUID.
     */
    else
#endif
        CPUMGetGuestCpuId(pVCpu, uEax, uEcx, pVCpu->cpum.GstCtx.cs.Attr.n.u1Long,
                          &pVCpu->cpum.GstCtx.eax, &pVCpu->cpum.GstCtx.ebx, &pVCpu->cpum.GstCtx.ecx, &pVCpu->cpum.GstCtx.edx);

    pVCpu->cpum.GstCtx.rax &= UINT32_C(0xffffffff);
    pVCpu->cpum.GstCtx.rbx &= UINT32_C(0xffffffff);
    pVCpu->cpum.GstCtx.rcx &= UINT32_C(0xffffffff);
    pVCpu->cpum.GstCtx.rdx &= UINT32_C(0xffffffff);
    pVCpu->cpum.GstCtx.fExtrn &= ~(CPUMCTX_EXTRN_RAX | CPUMCTX_EXTRN_RCX | CPUMCTX_EXTRN_RDX | CPUMCTX_EXTRN_RBX);

    pVCpu->iem.s.cPotentialExits++;
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'AAD'.
 *
 * @param   bImm            The immediate operand.
 */
IEM_CIMPL_DEF_1(iemCImpl_aad, uint8_t, bImm)
{
    uint16_t const ax = pVCpu->cpum.GstCtx.ax;
    uint8_t const  al = (uint8_t)ax + (uint8_t)(ax >> 8) * bImm;
    pVCpu->cpum.GstCtx.ax = al;
    iemHlpUpdateArithEFlagsU8(pVCpu, al,
                              X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF,
                              X86_EFL_OF | X86_EFL_AF | X86_EFL_CF);

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'AAM'.
 *
 * @param   bImm            The immediate operand. Cannot be 0.
 */
IEM_CIMPL_DEF_1(iemCImpl_aam, uint8_t, bImm)
{
    Assert(bImm != 0); /* #DE on 0 is handled in the decoder. */

    uint16_t const ax = pVCpu->cpum.GstCtx.ax;
    uint8_t const  al = (uint8_t)ax % bImm;
    uint8_t const  ah = (uint8_t)ax / bImm;
    pVCpu->cpum.GstCtx.ax = (ah << 8) + al;
    iemHlpUpdateArithEFlagsU8(pVCpu, al,
                              X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF,
                              X86_EFL_OF | X86_EFL_AF | X86_EFL_CF);

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'DAA'.
 */
IEM_CIMPL_DEF_0(iemCImpl_daa)
{
    uint8_t const  al       = pVCpu->cpum.GstCtx.al;
    bool const     fCarry   = pVCpu->cpum.GstCtx.eflags.Bits.u1CF;

    if (   pVCpu->cpum.GstCtx.eflags.Bits.u1AF
        || (al & 0xf) >= 10)
    {
        pVCpu->cpum.GstCtx.al = al + 6;
        pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 1;
    }
    else
        pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 0;

    if (al >= 0x9a || fCarry)
    {
        pVCpu->cpum.GstCtx.al += 0x60;
        pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 1;
    }
    else
        pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 0;

    iemHlpUpdateArithEFlagsU8(pVCpu, pVCpu->cpum.GstCtx.al, X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF, X86_EFL_OF);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'DAS'.
 */
IEM_CIMPL_DEF_0(iemCImpl_das)
{
    uint8_t const  uInputAL = pVCpu->cpum.GstCtx.al;
    bool const     fCarry   = pVCpu->cpum.GstCtx.eflags.Bits.u1CF;

    if (   pVCpu->cpum.GstCtx.eflags.Bits.u1AF
        || (uInputAL & 0xf) >= 10)
    {
        pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 1;
        if (uInputAL < 6)
            pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 1;
        pVCpu->cpum.GstCtx.al = uInputAL - 6;
    }
    else
    {
        pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 0;
        pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 0;
    }

    if (uInputAL >= 0x9a || fCarry)
    {
        pVCpu->cpum.GstCtx.al -= 0x60;
        pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 1;
    }

    iemHlpUpdateArithEFlagsU8(pVCpu, pVCpu->cpum.GstCtx.al, X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF, X86_EFL_OF);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'AAA'.
 */
IEM_CIMPL_DEF_0(iemCImpl_aaa)
{
    if (IEM_IS_GUEST_CPU_AMD(pVCpu))
    {
        if (   pVCpu->cpum.GstCtx.eflags.Bits.u1AF
            || (pVCpu->cpum.GstCtx.ax & 0xf) >= 10)
        {
            iemAImpl_add_u16(&pVCpu->cpum.GstCtx.ax, 0x106, &pVCpu->cpum.GstCtx.eflags.uBoth);
            pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 1;
            pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 1;
        }
        else
        {
            iemHlpUpdateArithEFlagsU16(pVCpu, pVCpu->cpum.GstCtx.ax, X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF, X86_EFL_OF);
            pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 0;
            pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 0;
        }
        pVCpu->cpum.GstCtx.ax &= UINT16_C(0xff0f);
    }
    else
    {
        if (   pVCpu->cpum.GstCtx.eflags.Bits.u1AF
            || (pVCpu->cpum.GstCtx.ax & 0xf) >= 10)
        {
            pVCpu->cpum.GstCtx.ax += UINT16_C(0x106);
            pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 1;
            pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 1;
        }
        else
        {
            pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 0;
            pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 0;
        }
        pVCpu->cpum.GstCtx.ax &= UINT16_C(0xff0f);
        iemHlpUpdateArithEFlagsU8(pVCpu, pVCpu->cpum.GstCtx.al, X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF, X86_EFL_OF);
    }

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'AAS'.
 */
IEM_CIMPL_DEF_0(iemCImpl_aas)
{
    if (IEM_IS_GUEST_CPU_AMD(pVCpu))
    {
        if (   pVCpu->cpum.GstCtx.eflags.Bits.u1AF
            || (pVCpu->cpum.GstCtx.ax & 0xf) >= 10)
        {
            iemAImpl_sub_u16(&pVCpu->cpum.GstCtx.ax, 0x106, &pVCpu->cpum.GstCtx.eflags.uBoth);
            pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 1;
            pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 1;
        }
        else
        {
            iemHlpUpdateArithEFlagsU16(pVCpu, pVCpu->cpum.GstCtx.ax, X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF, X86_EFL_OF);
            pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 0;
            pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 0;
        }
        pVCpu->cpum.GstCtx.ax &= UINT16_C(0xff0f);
    }
    else
    {
        if (   pVCpu->cpum.GstCtx.eflags.Bits.u1AF
            || (pVCpu->cpum.GstCtx.ax & 0xf) >= 10)
        {
            pVCpu->cpum.GstCtx.ax -= UINT16_C(0x106);
            pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 1;
            pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 1;
        }
        else
        {
            pVCpu->cpum.GstCtx.eflags.Bits.u1AF = 0;
            pVCpu->cpum.GstCtx.eflags.Bits.u1CF = 0;
        }
        pVCpu->cpum.GstCtx.ax &= UINT16_C(0xff0f);
        iemHlpUpdateArithEFlagsU8(pVCpu, pVCpu->cpum.GstCtx.al, X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF, X86_EFL_OF);
    }

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements the 16-bit version of 'BOUND'.
 *
 * @note    We have separate 16-bit and 32-bit variants of this function due to
 *          the decoder using unsigned parameters, whereas we want signed one to
 *          do the job.  This is significant for a recompiler.
 */
IEM_CIMPL_DEF_3(iemCImpl_bound_16, int16_t, idxArray, int16_t, idxLowerBound, int16_t, idxUpperBound)
{
    /*
     * Check if the index is inside the bounds, otherwise raise #BR.
     */
    if (   idxArray >= idxLowerBound
        && idxArray <= idxUpperBound)
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    return iemRaiseBoundRangeExceeded(pVCpu);
}


/**
 * Implements the 32-bit version of 'BOUND'.
 */
IEM_CIMPL_DEF_3(iemCImpl_bound_32, int32_t, idxArray, int32_t, idxLowerBound, int32_t, idxUpperBound)
{
    /*
     * Check if the index is inside the bounds, otherwise raise #BR.
     */
    if (   idxArray >= idxLowerBound
        && idxArray <= idxUpperBound)
        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    return iemRaiseBoundRangeExceeded(pVCpu);
}


/*
 * Instantiate the various string operation combinations.
 */
#define OP_SIZE     8
#define ADDR_SIZE   16
#include "IEMAllCImplStrInstr.cpp.h"
#define OP_SIZE     8
#define ADDR_SIZE   32
#include "IEMAllCImplStrInstr.cpp.h"
#define OP_SIZE     8
#define ADDR_SIZE   64
#include "IEMAllCImplStrInstr.cpp.h"

#define OP_SIZE     16
#define ADDR_SIZE   16
#include "IEMAllCImplStrInstr.cpp.h"
#define OP_SIZE     16
#define ADDR_SIZE   32
#include "IEMAllCImplStrInstr.cpp.h"
#define OP_SIZE     16
#define ADDR_SIZE   64
#include "IEMAllCImplStrInstr.cpp.h"

#define OP_SIZE     32
#define ADDR_SIZE   16
#include "IEMAllCImplStrInstr.cpp.h"
#define OP_SIZE     32
#define ADDR_SIZE   32
#include "IEMAllCImplStrInstr.cpp.h"
#define OP_SIZE     32
#define ADDR_SIZE   64
#include "IEMAllCImplStrInstr.cpp.h"

#define OP_SIZE     64
#define ADDR_SIZE   32
#include "IEMAllCImplStrInstr.cpp.h"
#define OP_SIZE     64
#define ADDR_SIZE   64
#include "IEMAllCImplStrInstr.cpp.h"


/**
 * Implements 'XGETBV'.
 */
IEM_CIMPL_DEF_0(iemCImpl_xgetbv)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR4);
    if (pVCpu->cpum.GstCtx.cr4 & X86_CR4_OSXSAVE)
    {
        uint32_t uEcx = pVCpu->cpum.GstCtx.ecx;
        switch (uEcx)
        {
            case 0:
                break;

            case 1: /** @todo Implement XCR1 support. */
            default:
                Log(("xgetbv ecx=%RX32 -> #GP(0)\n", uEcx));
                return iemRaiseGeneralProtectionFault0(pVCpu);

        }
        IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_XCRx);
        pVCpu->cpum.GstCtx.rax = RT_LO_U32(pVCpu->cpum.GstCtx.aXcr[uEcx]);
        pVCpu->cpum.GstCtx.rdx = RT_HI_U32(pVCpu->cpum.GstCtx.aXcr[uEcx]);

        return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
    }
    Log(("xgetbv CR4.OSXSAVE=0 -> UD\n"));
    return iemRaiseUndefinedOpcode(pVCpu);
}


/**
 * Implements 'XSETBV'.
 */
IEM_CIMPL_DEF_0(iemCImpl_xsetbv)
{
    if (pVCpu->cpum.GstCtx.cr4 & X86_CR4_OSXSAVE)
    {
        if (IEM_SVM_IS_CTRL_INTERCEPT_SET(pVCpu, SVM_CTRL_INTERCEPT_XSETBV))
        {
            Log2(("xsetbv: Guest intercept -> #VMEXIT\n"));
            IEM_SVM_UPDATE_NRIP(pVCpu);
            IEM_SVM_VMEXIT_RET(pVCpu, SVM_EXIT_XSETBV, 0 /* uExitInfo1 */, 0 /* uExitInfo2 */);
        }

        if (pVCpu->iem.s.uCpl == 0)
        {
            IEM_CTX_IMPORT_RET(pVCpu, CPUMCTX_EXTRN_XCRx);

            if (IEM_VMX_IS_NON_ROOT_MODE(pVCpu))
                IEM_VMX_VMEXIT_INSTR_RET(pVCpu, VMX_EXIT_XSETBV, cbInstr);

            uint32_t uEcx = pVCpu->cpum.GstCtx.ecx;
            uint64_t uNewValue = RT_MAKE_U64(pVCpu->cpum.GstCtx.eax, pVCpu->cpum.GstCtx.edx);
            switch (uEcx)
            {
                case 0:
                {
                    int rc = CPUMSetGuestXcr0(pVCpu, uNewValue);
                    if (rc == VINF_SUCCESS)
                        break;
                    Assert(rc == VERR_CPUM_RAISE_GP_0);
                    Log(("xsetbv ecx=%RX32 (newvalue=%RX64) -> #GP(0)\n", uEcx, uNewValue));
                    return iemRaiseGeneralProtectionFault0(pVCpu);
                }

                case 1: /** @todo Implement XCR1 support. */
                default:
                    Log(("xsetbv ecx=%RX32 (newvalue=%RX64) -> #GP(0)\n", uEcx, uNewValue));
                    return iemRaiseGeneralProtectionFault0(pVCpu);

            }

            return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }

        Log(("xsetbv cpl=%u -> GP(0)\n", pVCpu->iem.s.uCpl));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }
    Log(("xsetbv CR4.OSXSAVE=0 -> UD\n"));
    return iemRaiseUndefinedOpcode(pVCpu);
}

#ifndef RT_ARCH_ARM64
# ifdef IN_RING3

/** Argument package for iemCImpl_cmpxchg16b_fallback_rendezvous_callback. */
struct IEMCIMPLCX16ARGS
{
    PRTUINT128U     pu128Dst;
    PRTUINT128U     pu128RaxRdx;
    PRTUINT128U     pu128RbxRcx;
    uint32_t       *pEFlags;
#  ifdef VBOX_STRICT
    uint32_t        cCalls;
#  endif
};

/**
 * @callback_method_impl{FNVMMEMTRENDEZVOUS,
 *                       Worker for iemCImpl_cmpxchg16b_fallback_rendezvous}
 */
static DECLCALLBACK(VBOXSTRICTRC) iemCImpl_cmpxchg16b_fallback_rendezvous_callback(PVM pVM, PVMCPUCC pVCpu, void *pvUser)
{
    RT_NOREF(pVM, pVCpu);
    struct IEMCIMPLCX16ARGS *pArgs = (struct IEMCIMPLCX16ARGS *)pvUser;
#  ifdef VBOX_STRICT
    Assert(pArgs->cCalls == 0);
    pArgs->cCalls++;
#  endif

    iemAImpl_cmpxchg16b_fallback(pArgs->pu128Dst, pArgs->pu128RaxRdx, pArgs->pu128RbxRcx, pArgs->pEFlags);
    return VINF_SUCCESS;
}

# endif /* IN_RING3 */

/**
 * Implements 'CMPXCHG16B' fallback using rendezvous.
 */
IEM_CIMPL_DEF_4(iemCImpl_cmpxchg16b_fallback_rendezvous, PRTUINT128U, pu128Dst, PRTUINT128U, pu128RaxRdx,
                PRTUINT128U, pu128RbxRcx, uint32_t *, pEFlags)
{
# ifdef IN_RING3
    struct IEMCIMPLCX16ARGS Args;
    Args.pu128Dst       = pu128Dst;
    Args.pu128RaxRdx    = pu128RaxRdx;
    Args.pu128RbxRcx    = pu128RbxRcx;
    Args.pEFlags        = pEFlags;
#  ifdef VBOX_STRICT
    Args.cCalls         = 0;
#  endif
    VBOXSTRICTRC rcStrict = VMMR3EmtRendezvous(pVCpu->CTX_SUFF(pVM), VMMEMTRENDEZVOUS_FLAGS_TYPE_ONCE,
                                               iemCImpl_cmpxchg16b_fallback_rendezvous_callback, &Args);
    Assert(Args.cCalls == 1);
    if (rcStrict == VINF_SUCCESS)
    {
        /* Duplicated tail code. */
        rcStrict = iemMemCommitAndUnmap(pVCpu, pu128Dst, IEM_ACCESS_DATA_RW);
        if (rcStrict == VINF_SUCCESS)
        {
            pVCpu->cpum.GstCtx.eflags.u = *pEFlags; /* IEM_MC_COMMIT_EFLAGS */
            if (!(*pEFlags & X86_EFL_ZF))
            {
                pVCpu->cpum.GstCtx.rax = pu128RaxRdx->s.Lo;
                pVCpu->cpum.GstCtx.rdx = pu128RaxRdx->s.Hi;
            }
            rcStrict = iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }
    }
    return rcStrict;
# else
    RT_NOREF(pVCpu, cbInstr, pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
    return VERR_IEM_ASPECT_NOT_IMPLEMENTED; /* This should get us to ring-3 for now.  Should perhaps be replaced later. */
# endif
}

#endif /* RT_ARCH_ARM64 */

/**
 * Implements 'CLFLUSH' and 'CLFLUSHOPT'.
 *
 * This is implemented in C because it triggers a load like behaviour without
 * actually reading anything.  Since that's not so common, it's implemented
 * here.
 *
 * @param   iEffSeg         The effective segment.
 * @param   GCPtrEff        The address of the image.
 */
IEM_CIMPL_DEF_2(iemCImpl_clflush_clflushopt, uint8_t, iEffSeg, RTGCPTR, GCPtrEff)
{
    /*
     * Pretend to do a load w/o reading (see also iemCImpl_monitor and iemMemMap).
     */
    VBOXSTRICTRC rcStrict = iemMemApplySegment(pVCpu, IEM_ACCESS_TYPE_READ | IEM_ACCESS_WHAT_DATA, iEffSeg, 1, &GCPtrEff);
    if (rcStrict == VINF_SUCCESS)
    {
        RTGCPHYS GCPhysMem;
        /** @todo access size.   */
        rcStrict = iemMemPageTranslateAndCheckAccess(pVCpu, GCPtrEff, 1, IEM_ACCESS_TYPE_READ | IEM_ACCESS_WHAT_DATA, &GCPhysMem);
        if (rcStrict == VINF_SUCCESS)
        {
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
            if (   IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
                && IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_VIRT_APIC_ACCESS))
            {
                /*
                 * CLFLUSH/CLFLUSHOPT does not access the memory, but flushes the cache-line
                 * that contains the address. However, if the address falls in the APIC-access
                 * page, the address flushed must instead be the corresponding address in the
                 * virtual-APIC page.
                 *
                 * See Intel spec. 29.4.4 "Instruction-Specific Considerations".
                 */
                rcStrict = iemVmxVirtApicAccessUnused(pVCpu, &GCPhysMem, 1, IEM_ACCESS_TYPE_READ | IEM_ACCESS_WHAT_DATA);
                if (   rcStrict != VINF_VMX_INTERCEPT_NOT_ACTIVE
                    && rcStrict != VINF_VMX_MODIFIES_BEHAVIOR)
                    return rcStrict;
            }
#endif
            return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
        }
    }

    return rcStrict;
}


/**
 * Implements 'FINIT' and 'FNINIT'.
 *
 * @param   fCheckXcpts     Whether to check for umasked pending exceptions or
 *                          not.
 */
IEM_CIMPL_DEF_1(iemCImpl_finit, bool, fCheckXcpts)
{
    /*
     * Exceptions.
     */
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0);
    if (pVCpu->cpum.GstCtx.cr0 & (X86_CR0_EM | X86_CR0_TS))
        return iemRaiseDeviceNotAvailable(pVCpu);

    iemFpuActualizeStateForChange(pVCpu);
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_X87);

    /* FINIT: Raise #MF on pending exception(s): */
    if (fCheckXcpts && (pVCpu->cpum.GstCtx.XState.x87.FSW & X86_FSW_ES))
        return iemRaiseMathFault(pVCpu);

    /*
     * Reset the state.
     */
    PX86XSAVEAREA pXState = &pVCpu->cpum.GstCtx.XState;

    /* Rotate the stack to account for changed TOS. */
    iemFpuRotateStackSetTop(&pXState->x87, 0);

    pXState->x87.FCW        = 0x37f;
    pXState->x87.FSW        = 0;
    pXState->x87.FTW        = 0x00;     /* 0 - empty. */
    /** @todo Intel says the instruction and data pointers are not cleared on
     *        387, presume that 8087 and 287 doesn't do so either. */
    /** @todo test this stuff.   */
    if (IEM_GET_TARGET_CPU(pVCpu) > IEMTARGETCPU_386)
    {
        pXState->x87.FPUDP  = 0;
        pXState->x87.DS     = 0; //??
        pXState->x87.Rsrvd2 = 0;
        pXState->x87.FPUIP  = 0;
        pXState->x87.CS     = 0; //??
        pXState->x87.Rsrvd1 = 0;
    }
    pXState->x87.FOP        = 0;

    iemHlpUsedFpu(pVCpu);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'FXSAVE'.
 *
 * @param   iEffSeg         The effective segment.
 * @param   GCPtrEff        The address of the image.
 * @param   enmEffOpSize    The operand size (only REX.W really matters).
 */
IEM_CIMPL_DEF_3(iemCImpl_fxsave, uint8_t, iEffSeg, RTGCPTR, GCPtrEff, IEMMODE, enmEffOpSize)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87 | CPUMCTX_EXTRN_SSE_AVX);

    /*
     * Raise exceptions.
     */
    if (pVCpu->cpum.GstCtx.cr0 & (X86_CR0_TS | X86_CR0_EM))
        return iemRaiseDeviceNotAvailable(pVCpu);

    /*
     * Access the memory.
     */
    void *pvMem512;
    VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, &pvMem512, 512, iEffSeg, GCPtrEff, IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE,
                                      15 | IEM_MEMMAP_F_ALIGN_GP | IEM_MEMMAP_F_ALIGN_GP_OR_AC);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;
    PX86FXSTATE  pDst = (PX86FXSTATE)pvMem512;
    PCX86FXSTATE pSrc = &pVCpu->cpum.GstCtx.XState.x87;

    /*
     * Store the registers.
     */
    /** @todo CPU/VM detection possible! If CR4.OSFXSR=0 MXCSR it's
     * implementation specific whether MXCSR and XMM0-XMM7 are saved. */

    /* common for all formats */
    pDst->FCW           = pSrc->FCW;
    pDst->FSW           = pSrc->FSW;
    pDst->FTW           = pSrc->FTW & UINT16_C(0xff);
    pDst->FOP           = pSrc->FOP;
    pDst->MXCSR         = pSrc->MXCSR;
    pDst->MXCSR_MASK    = CPUMGetGuestMxCsrMask(pVCpu->CTX_SUFF(pVM));
    for (uint32_t i = 0; i < RT_ELEMENTS(pDst->aRegs); i++)
    {
        /** @todo Testcase: What actually happens to the 6 reserved bytes? I'm clearing
         *        them for now... */
        pDst->aRegs[i].au32[0] = pSrc->aRegs[i].au32[0];
        pDst->aRegs[i].au32[1] = pSrc->aRegs[i].au32[1];
        pDst->aRegs[i].au32[2] = pSrc->aRegs[i].au32[2] & UINT32_C(0xffff);
        pDst->aRegs[i].au32[3] = 0;
    }

    /* FPU IP, CS, DP and DS. */
    pDst->FPUIP  = pSrc->FPUIP;
    pDst->CS     = pSrc->CS;
    pDst->FPUDP  = pSrc->FPUDP;
    pDst->DS     = pSrc->DS;
    if (enmEffOpSize == IEMMODE_64BIT)
    {
        /* Save upper 16-bits of FPUIP (IP:CS:Rsvd1) and FPUDP (DP:DS:Rsvd2). */
        pDst->Rsrvd1 = pSrc->Rsrvd1;
        pDst->Rsrvd2 = pSrc->Rsrvd2;
    }
    else
    {
        pDst->Rsrvd1 = 0;
        pDst->Rsrvd2 = 0;
    }

    /* XMM registers. Skipped in 64-bit CPL0 if EFER.FFXSR (AMD only) is set. */
    if (   !(pVCpu->cpum.GstCtx.msrEFER & MSR_K6_EFER_FFXSR)
        || pVCpu->iem.s.enmCpuMode != IEMMODE_64BIT
        || pVCpu->iem.s.uCpl != 0)
    {
        uint32_t cXmmRegs = pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT ? 16 : 8;
        for (uint32_t i = 0; i < cXmmRegs; i++)
            pDst->aXMM[i] = pSrc->aXMM[i];
        /** @todo Testcase: What happens to the reserved XMM registers? Untouched,
         *        right? */
    }

    /*
     * Commit the memory.
     */
    rcStrict = iemMemCommitAndUnmap(pVCpu, pvMem512, IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'FXRSTOR'.
 *
 * @param   iEffSeg         The effective segment register for @a GCPtrEff.
 * @param   GCPtrEff        The address of the image.
 * @param   enmEffOpSize    The operand size (only REX.W really matters).
 */
IEM_CIMPL_DEF_3(iemCImpl_fxrstor, uint8_t, iEffSeg, RTGCPTR, GCPtrEff, IEMMODE, enmEffOpSize)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87 | CPUMCTX_EXTRN_SSE_AVX);

    /*
     * Raise exceptions.
     */
    if (pVCpu->cpum.GstCtx.cr0 & (X86_CR0_TS | X86_CR0_EM))
        return iemRaiseDeviceNotAvailable(pVCpu);

    /*
     * Access the memory.
     */
    void *pvMem512;
    VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, &pvMem512, 512, iEffSeg, GCPtrEff, IEM_ACCESS_DATA_R,
                                      15 | IEM_MEMMAP_F_ALIGN_GP | IEM_MEMMAP_F_ALIGN_GP_OR_AC);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;
    PCX86FXSTATE pSrc = (PCX86FXSTATE)pvMem512;
    PX86FXSTATE  pDst = &pVCpu->cpum.GstCtx.XState.x87;

    /*
     * Check the state for stuff which will #GP(0).
     */
    uint32_t const fMXCSR      = pSrc->MXCSR;
    uint32_t const fMXCSR_MASK = CPUMGetGuestMxCsrMask(pVCpu->CTX_SUFF(pVM));
    if (fMXCSR & ~fMXCSR_MASK)
    {
        Log(("fxrstor: MXCSR=%#x (MXCSR_MASK=%#x) -> #GP(0)\n", fMXCSR, fMXCSR_MASK));
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

    /*
     * Load the registers.
     */
    /** @todo CPU/VM detection possible! If CR4.OSFXSR=0 MXCSR it's
     * implementation specific whether MXCSR and XMM0-XMM7 are
     * restored according to Intel.
     * AMD says MXCSR and XMM registers are never loaded if
     * CR4.OSFXSR=0.
     */

    /* common for all formats */
    pDst->FCW       = pSrc->FCW;
    pDst->FSW       = pSrc->FSW;
    pDst->FTW       = pSrc->FTW & UINT16_C(0xff);
    pDst->FOP       = pSrc->FOP;
    pDst->MXCSR     = fMXCSR;
    /* (MXCSR_MASK is read-only) */
    for (uint32_t i = 0; i < RT_ELEMENTS(pSrc->aRegs); i++)
    {
        pDst->aRegs[i].au32[0] = pSrc->aRegs[i].au32[0];
        pDst->aRegs[i].au32[1] = pSrc->aRegs[i].au32[1];
        pDst->aRegs[i].au32[2] = pSrc->aRegs[i].au32[2] & UINT32_C(0xffff);
        pDst->aRegs[i].au32[3] = 0;
    }

    /* FPU IP, CS, DP and DS. */
    /** @todo AMD says this is only done if FSW.ES is set after loading. */
    if (enmEffOpSize == IEMMODE_64BIT)
    {
        pDst->FPUIP  = pSrc->FPUIP;
        pDst->CS     = pSrc->CS;
        pDst->Rsrvd1 = pSrc->Rsrvd1;
        pDst->FPUDP  = pSrc->FPUDP;
        pDst->DS     = pSrc->DS;
        pDst->Rsrvd2 = pSrc->Rsrvd2;
    }
    else
    {
        pDst->FPUIP  = pSrc->FPUIP;
        pDst->CS     = pSrc->CS;
        pDst->Rsrvd1 = 0;
        pDst->FPUDP  = pSrc->FPUDP;
        pDst->DS     = pSrc->DS;
        pDst->Rsrvd2 = 0;
    }

    /* XMM registers. Skipped in 64-bit CPL0 if EFER.FFXSR (AMD only) is set.
     * Does not affect MXCSR, only registers.
     */
    if (   !(pVCpu->cpum.GstCtx.msrEFER & MSR_K6_EFER_FFXSR)
        || pVCpu->iem.s.enmCpuMode != IEMMODE_64BIT
        || pVCpu->iem.s.uCpl != 0)
    {
        uint32_t cXmmRegs = pVCpu->iem.s.enmCpuMode == IEMMODE_64BIT ? 16 : 8;
        for (uint32_t i = 0; i < cXmmRegs; i++)
            pDst->aXMM[i] = pSrc->aXMM[i];
    }

    pDst->FCW &= ~X86_FCW_ZERO_MASK | X86_FCW_IC_MASK; /* Intel 10980xe allows setting the IC bit. Win 3.11 CALC.EXE sets it. */
    iemFpuRecalcExceptionStatus(pDst);

    if (pDst->FSW & X86_FSW_ES)
        Log11(("fxrstor: %04x:%08RX64: loading state with pending FPU exception (FSW=%#x)\n",
               pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pSrc->FSW));

    /*
     * Unmap the memory.
     */
    rcStrict = iemMemCommitAndUnmap(pVCpu, pvMem512, IEM_ACCESS_DATA_R);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    iemHlpUsedFpu(pVCpu);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'XSAVE'.
 *
 * @param   iEffSeg         The effective segment.
 * @param   GCPtrEff        The address of the image.
 * @param   enmEffOpSize    The operand size (only REX.W really matters).
 */
IEM_CIMPL_DEF_3(iemCImpl_xsave, uint8_t, iEffSeg, RTGCPTR, GCPtrEff, IEMMODE, enmEffOpSize)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87 | CPUMCTX_EXTRN_SSE_AVX | CPUMCTX_EXTRN_OTHER_XSAVE | CPUMCTX_EXTRN_XCRx);

    /*
     * Raise exceptions.
     */
    if (!(pVCpu->cpum.GstCtx.cr4 & X86_CR4_OSXSAVE))
        return iemRaiseUndefinedOpcode(pVCpu);
    /* When in VMX non-root mode and XSAVE/XRSTOR is not enabled, it results in #UD. */
    if (    IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && !IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_XSAVES_XRSTORS))
    {
        Log(("xrstor: Not enabled for nested-guest execution -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (pVCpu->cpum.GstCtx.cr0 & X86_CR0_TS)
        return iemRaiseDeviceNotAvailable(pVCpu);

    /*
     * Calc the requested mask.
     */
    uint64_t const fReqComponents = RT_MAKE_U64(pVCpu->cpum.GstCtx.eax, pVCpu->cpum.GstCtx.edx) & pVCpu->cpum.GstCtx.aXcr[0];
    AssertLogRelReturn(!(fReqComponents & ~(XSAVE_C_X87 | XSAVE_C_SSE | XSAVE_C_YMM)), VERR_IEM_ASPECT_NOT_IMPLEMENTED);
    uint64_t const fXInUse        = pVCpu->cpum.GstCtx.aXcr[0];

/** @todo figure out the exact protocol for the memory access.  Currently we
 *        just need this crap to work halfways to make it possible to test
 *        AVX instructions. */
/** @todo figure out the XINUSE and XMODIFIED   */

    /*
     * Access the x87 memory state.
     */
    /* The x87+SSE state.  */
    void *pvMem512;
    VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, &pvMem512, 512, iEffSeg, GCPtrEff, IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE,
                                      63 | IEM_MEMMAP_F_ALIGN_GP | IEM_MEMMAP_F_ALIGN_GP_OR_AC);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;
    PX86FXSTATE  pDst = (PX86FXSTATE)pvMem512;
    PCX86FXSTATE pSrc = &pVCpu->cpum.GstCtx.XState.x87;

    /* The header.  */
    PX86XSAVEHDR pHdr;
    rcStrict = iemMemMap(pVCpu, (void **)&pHdr, sizeof(&pHdr), iEffSeg, GCPtrEff + 512, IEM_ACCESS_DATA_RW, 0 /* checked above */);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /*
     * Store the X87 state.
     */
    if (fReqComponents & XSAVE_C_X87)
    {
        /* common for all formats */
        pDst->FCW    = pSrc->FCW;
        pDst->FSW    = pSrc->FSW;
        pDst->FTW    = pSrc->FTW & UINT16_C(0xff);
        pDst->FOP    = pSrc->FOP;
        pDst->FPUIP  = pSrc->FPUIP;
        pDst->CS     = pSrc->CS;
        pDst->FPUDP  = pSrc->FPUDP;
        pDst->DS     = pSrc->DS;
        if (enmEffOpSize == IEMMODE_64BIT)
        {
            /* Save upper 16-bits of FPUIP (IP:CS:Rsvd1) and FPUDP (DP:DS:Rsvd2). */
            pDst->Rsrvd1 = pSrc->Rsrvd1;
            pDst->Rsrvd2 = pSrc->Rsrvd2;
        }
        else
        {
            pDst->Rsrvd1 = 0;
            pDst->Rsrvd2 = 0;
        }
        for (uint32_t i = 0; i < RT_ELEMENTS(pDst->aRegs); i++)
        {
            /** @todo Testcase: What actually happens to the 6 reserved bytes? I'm clearing
             *        them for now... */
            pDst->aRegs[i].au32[0] = pSrc->aRegs[i].au32[0];
            pDst->aRegs[i].au32[1] = pSrc->aRegs[i].au32[1];
            pDst->aRegs[i].au32[2] = pSrc->aRegs[i].au32[2] & UINT32_C(0xffff);
            pDst->aRegs[i].au32[3] = 0;
        }

    }

    if (fReqComponents & (XSAVE_C_SSE | XSAVE_C_YMM))
    {
        pDst->MXCSR         = pSrc->MXCSR;
        pDst->MXCSR_MASK    = CPUMGetGuestMxCsrMask(pVCpu->CTX_SUFF(pVM));
    }

    if (fReqComponents & XSAVE_C_SSE)
    {
        /* XMM registers. */
        uint32_t cXmmRegs = enmEffOpSize == IEMMODE_64BIT ? 16 : 8;
        for (uint32_t i = 0; i < cXmmRegs; i++)
            pDst->aXMM[i] = pSrc->aXMM[i];
        /** @todo Testcase: What happens to the reserved XMM registers? Untouched,
         *        right? */
    }

    /* Commit the x87 state bits. (probably wrong) */
    rcStrict = iemMemCommitAndUnmap(pVCpu, pvMem512, IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /*
     * Store AVX state.
     */
    if (fReqComponents & XSAVE_C_YMM)
    {
        /** @todo testcase: xsave64 vs xsave32 wrt XSAVE_C_YMM. */
        AssertLogRelReturn(pVCpu->cpum.GstCtx.aoffXState[XSAVE_C_YMM_BIT] != UINT16_MAX, VERR_IEM_IPE_9);
        PCX86XSAVEYMMHI pCompSrc = CPUMCTX_XSAVE_C_PTR(IEM_GET_CTX(pVCpu), XSAVE_C_YMM_BIT, PCX86XSAVEYMMHI);
        PX86XSAVEYMMHI  pCompDst;
        rcStrict = iemMemMap(pVCpu, (void **)&pCompDst, sizeof(*pCompDst), iEffSeg, GCPtrEff + pVCpu->cpum.GstCtx.aoffXState[XSAVE_C_YMM_BIT],
                             IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE, 0 /* checked above */);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;

        uint32_t cXmmRegs = enmEffOpSize == IEMMODE_64BIT ? 16 : 8;
        for (uint32_t i = 0; i < cXmmRegs; i++)
            pCompDst->aYmmHi[i] = pCompSrc->aYmmHi[i];

        rcStrict = iemMemCommitAndUnmap(pVCpu, pCompDst, IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
    }

    /*
     * Update the header.
     */
    pHdr->bmXState = (pHdr->bmXState & ~fReqComponents)
                   | (fReqComponents & fXInUse);

    rcStrict = iemMemCommitAndUnmap(pVCpu, pHdr, IEM_ACCESS_DATA_RW);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'XRSTOR'.
 *
 * @param   iEffSeg         The effective segment.
 * @param   GCPtrEff        The address of the image.
 * @param   enmEffOpSize    The operand size (only REX.W really matters).
 */
IEM_CIMPL_DEF_3(iemCImpl_xrstor, uint8_t, iEffSeg, RTGCPTR, GCPtrEff, IEMMODE, enmEffOpSize)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87 | CPUMCTX_EXTRN_SSE_AVX | CPUMCTX_EXTRN_OTHER_XSAVE | CPUMCTX_EXTRN_XCRx);

    /*
     * Raise exceptions.
     */
    if (!(pVCpu->cpum.GstCtx.cr4 & X86_CR4_OSXSAVE))
        return iemRaiseUndefinedOpcode(pVCpu);
    /* When in VMX non-root mode and XSAVE/XRSTOR is not enabled, it results in #UD. */
    if (    IEM_VMX_IS_NON_ROOT_MODE(pVCpu)
        && !IEM_VMX_IS_PROCCTLS2_SET(pVCpu, VMX_PROC_CTLS2_XSAVES_XRSTORS))
    {
        Log(("xrstor: Not enabled for nested-guest execution -> #UD\n"));
        return iemRaiseUndefinedOpcode(pVCpu);
    }
    if (pVCpu->cpum.GstCtx.cr0 & X86_CR0_TS)
        return iemRaiseDeviceNotAvailable(pVCpu);
    if (GCPtrEff & 63)
    {
        /** @todo CPU/VM detection possible! \#AC might not be signal for
         * all/any misalignment sizes, intel says its an implementation detail. */
        if (   (pVCpu->cpum.GstCtx.cr0 & X86_CR0_AM)
            && pVCpu->cpum.GstCtx.eflags.Bits.u1AC
            && pVCpu->iem.s.uCpl == 3)
            return iemRaiseAlignmentCheckException(pVCpu);
        return iemRaiseGeneralProtectionFault0(pVCpu);
    }

/** @todo figure out the exact protocol for the memory access.  Currently we
 *        just need this crap to work halfways to make it possible to test
 *        AVX instructions. */
/** @todo figure out the XINUSE and XMODIFIED   */

    /*
     * Access the x87 memory state.
     */
    /* The x87+SSE state.  */
    void *pvMem512;
    VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, &pvMem512, 512, iEffSeg, GCPtrEff, IEM_ACCESS_DATA_R,
                                      63 | IEM_MEMMAP_F_ALIGN_GP | IEM_MEMMAP_F_ALIGN_GP_OR_AC);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;
    PCX86FXSTATE pSrc = (PCX86FXSTATE)pvMem512;
    PX86FXSTATE  pDst = &pVCpu->cpum.GstCtx.XState.x87;

    /*
     * Calc the requested mask
     */
    PX86XSAVEHDR  pHdrDst = &pVCpu->cpum.GstCtx.XState.Hdr;
    PCX86XSAVEHDR pHdrSrc;
    rcStrict = iemMemMap(pVCpu, (void **)&pHdrSrc, sizeof(&pHdrSrc), iEffSeg, GCPtrEff + 512,
                         IEM_ACCESS_DATA_R, 0 /* checked above */);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    uint64_t const fReqComponents = RT_MAKE_U64(pVCpu->cpum.GstCtx.eax, pVCpu->cpum.GstCtx.edx) & pVCpu->cpum.GstCtx.aXcr[0];
    AssertLogRelReturn(!(fReqComponents & ~(XSAVE_C_X87 | XSAVE_C_SSE | XSAVE_C_YMM)), VERR_IEM_ASPECT_NOT_IMPLEMENTED);
    //uint64_t const fXInUse        = pVCpu->cpum.GstCtx.aXcr[0];
    uint64_t const fRstorMask     = pHdrSrc->bmXState;
    uint64_t const fCompMask      = pHdrSrc->bmXComp;

    AssertLogRelReturn(!(fCompMask & XSAVE_C_X), VERR_IEM_ASPECT_NOT_IMPLEMENTED);

    uint32_t const cXmmRegs = enmEffOpSize == IEMMODE_64BIT ? 16 : 8;

    /* We won't need this any longer. */
    rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)pHdrSrc, IEM_ACCESS_DATA_R);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /*
     * Load the X87 state.
     */
    if (fReqComponents & XSAVE_C_X87)
    {
        if (fRstorMask & XSAVE_C_X87)
        {
            pDst->FCW    = pSrc->FCW;
            pDst->FSW    = pSrc->FSW;
            pDst->FTW    = pSrc->FTW & UINT16_C(0xff);
            pDst->FOP    = pSrc->FOP;
            pDst->FPUIP  = pSrc->FPUIP;
            pDst->CS     = pSrc->CS;
            pDst->FPUDP  = pSrc->FPUDP;
            pDst->DS     = pSrc->DS;
            if (enmEffOpSize == IEMMODE_64BIT)
            {
                /* Load upper 16-bits of FPUIP (IP:CS:Rsvd1) and FPUDP (DP:DS:Rsvd2). */
                pDst->Rsrvd1 = pSrc->Rsrvd1;
                pDst->Rsrvd2 = pSrc->Rsrvd2;
            }
            else
            {
                pDst->Rsrvd1 = 0;
                pDst->Rsrvd2 = 0;
            }
            for (uint32_t i = 0; i < RT_ELEMENTS(pDst->aRegs); i++)
            {
                pDst->aRegs[i].au32[0] = pSrc->aRegs[i].au32[0];
                pDst->aRegs[i].au32[1] = pSrc->aRegs[i].au32[1];
                pDst->aRegs[i].au32[2] = pSrc->aRegs[i].au32[2] & UINT32_C(0xffff);
                pDst->aRegs[i].au32[3] = 0;
            }

            pDst->FCW &= ~X86_FCW_ZERO_MASK | X86_FCW_IC_MASK; /* Intel 10980xe allows setting the IC bit. Win 3.11 CALC.EXE sets it. */
            iemFpuRecalcExceptionStatus(pDst);

            if (pDst->FSW & X86_FSW_ES)
                Log11(("xrstor: %04x:%08RX64: loading state with pending FPU exception (FSW=%#x)\n",
                       pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pSrc->FSW));
        }
        else
        {
            pDst->FCW   = 0x37f;
            pDst->FSW   = 0;
            pDst->FTW   = 0x00;         /* 0 - empty. */
            pDst->FPUDP = 0;
            pDst->DS    = 0; //??
            pDst->Rsrvd2= 0;
            pDst->FPUIP = 0;
            pDst->CS    = 0; //??
            pDst->Rsrvd1= 0;
            pDst->FOP   = 0;
            for (uint32_t i = 0; i < RT_ELEMENTS(pSrc->aRegs); i++)
            {
                pDst->aRegs[i].au32[0] = 0;
                pDst->aRegs[i].au32[1] = 0;
                pDst->aRegs[i].au32[2] = 0;
                pDst->aRegs[i].au32[3] = 0;
            }
        }
        pHdrDst->bmXState |= XSAVE_C_X87; /* playing safe for now */
    }

    /* MXCSR */
    if (fReqComponents & (XSAVE_C_SSE | XSAVE_C_YMM))
    {
        if (fRstorMask & (XSAVE_C_SSE | XSAVE_C_YMM))
            pDst->MXCSR = pSrc->MXCSR;
        else
            pDst->MXCSR = 0x1f80;
    }

    /* XMM registers. */
    if (fReqComponents & XSAVE_C_SSE)
    {
        if (fRstorMask & XSAVE_C_SSE)
        {
            for (uint32_t i = 0; i < cXmmRegs; i++)
                pDst->aXMM[i] = pSrc->aXMM[i];
            /** @todo Testcase: What happens to the reserved XMM registers? Untouched,
             *        right? */
        }
        else
        {
            for (uint32_t i = 0; i < cXmmRegs; i++)
            {
                pDst->aXMM[i].au64[0] = 0;
                pDst->aXMM[i].au64[1] = 0;
            }
        }
        pHdrDst->bmXState |= XSAVE_C_SSE; /* playing safe for now */
    }

    /* Unmap the x87 state bits (so we've don't run out of mapping). */
    rcStrict = iemMemCommitAndUnmap(pVCpu, pvMem512, IEM_ACCESS_DATA_R);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /*
     * Restore AVX state.
     */
    if (fReqComponents & XSAVE_C_YMM)
    {
        AssertLogRelReturn(pVCpu->cpum.GstCtx.aoffXState[XSAVE_C_YMM_BIT] != UINT16_MAX, VERR_IEM_IPE_9);
        PX86XSAVEYMMHI  pCompDst = CPUMCTX_XSAVE_C_PTR(IEM_GET_CTX(pVCpu), XSAVE_C_YMM_BIT, PX86XSAVEYMMHI);

        if (fRstorMask & XSAVE_C_YMM)
        {
            /** @todo testcase: xsave64 vs xsave32 wrt XSAVE_C_YMM. */
            PCX86XSAVEYMMHI pCompSrc;
            rcStrict = iemMemMap(pVCpu, (void **)&pCompSrc, sizeof(*pCompDst),
                                 iEffSeg, GCPtrEff + pVCpu->cpum.GstCtx.aoffXState[XSAVE_C_YMM_BIT],
                                 IEM_ACCESS_DATA_R, 0 /* checked above */);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;

            for (uint32_t i = 0; i < cXmmRegs; i++)
            {
                pCompDst->aYmmHi[i].au64[0] = pCompSrc->aYmmHi[i].au64[0];
                pCompDst->aYmmHi[i].au64[1] = pCompSrc->aYmmHi[i].au64[1];
            }

            rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)pCompSrc, IEM_ACCESS_DATA_R);
            if (rcStrict != VINF_SUCCESS)
                return rcStrict;
        }
        else
        {
            for (uint32_t i = 0; i < cXmmRegs; i++)
            {
                pCompDst->aYmmHi[i].au64[0] = 0;
                pCompDst->aYmmHi[i].au64[1] = 0;
            }
        }
        pHdrDst->bmXState |= XSAVE_C_YMM; /* playing safe for now */
    }

    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'STMXCSR'.
 *
 * @param   iEffSeg         The effective segment register for @a GCPtrEff.
 * @param   GCPtrEff        The address of the image.
 */
IEM_CIMPL_DEF_2(iemCImpl_stmxcsr, uint8_t, iEffSeg, RTGCPTR, GCPtrEff)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87 | CPUMCTX_EXTRN_SSE_AVX);

    /*
     * Raise exceptions.
     */
    if (   !(pVCpu->cpum.GstCtx.cr0 & X86_CR0_EM)
        && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_OSFXSR))
    {
        if (!(pVCpu->cpum.GstCtx.cr0 & X86_CR0_TS))
        {
            /*
             * Do the job.
             */
            VBOXSTRICTRC rcStrict = iemMemStoreDataU32(pVCpu, iEffSeg, GCPtrEff, pVCpu->cpum.GstCtx.XState.x87.MXCSR);
            if (rcStrict == VINF_SUCCESS)
                return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
            return rcStrict;
        }
        return iemRaiseDeviceNotAvailable(pVCpu);
    }
    return iemRaiseUndefinedOpcode(pVCpu);
}


/**
 * Implements 'VSTMXCSR'.
 *
 * @param   iEffSeg         The effective segment register for @a GCPtrEff.
 * @param   GCPtrEff        The address of the image.
 */
IEM_CIMPL_DEF_2(iemCImpl_vstmxcsr, uint8_t, iEffSeg, RTGCPTR, GCPtrEff)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87 | CPUMCTX_EXTRN_SSE_AVX | CPUMCTX_EXTRN_XCRx);

    /*
     * Raise exceptions.
     */
    if (   (   !IEM_IS_GUEST_CPU_AMD(pVCpu)
            ? (pVCpu->cpum.GstCtx.aXcr[0] & (XSAVE_C_SSE | XSAVE_C_YMM)) == (XSAVE_C_SSE | XSAVE_C_YMM)
            : !(pVCpu->cpum.GstCtx.cr0 & X86_CR0_EM)) /* AMD Jaguar CPU (f0x16,m0,s1) behaviour */
        && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_OSXSAVE))
    {
        if (!(pVCpu->cpum.GstCtx.cr0 & X86_CR0_TS))
        {
            /*
             * Do the job.
             */
            VBOXSTRICTRC rcStrict = iemMemStoreDataU32(pVCpu, iEffSeg, GCPtrEff, pVCpu->cpum.GstCtx.XState.x87.MXCSR);
            if (rcStrict == VINF_SUCCESS)
                return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
            return rcStrict;
        }
        return iemRaiseDeviceNotAvailable(pVCpu);
    }
    return iemRaiseUndefinedOpcode(pVCpu);
}


/**
 * Implements 'LDMXCSR'.
 *
 * @param   iEffSeg         The effective segment register for @a GCPtrEff.
 * @param   GCPtrEff        The address of the image.
 */
IEM_CIMPL_DEF_2(iemCImpl_ldmxcsr, uint8_t, iEffSeg, RTGCPTR, GCPtrEff)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87 | CPUMCTX_EXTRN_SSE_AVX);

    /*
     * Raise exceptions.
     */
    /** @todo testcase - order of LDMXCSR faults.  Does \#PF, \#GP and \#SS
     *        happen after or before \#UD and \#EM? */
    if (   !(pVCpu->cpum.GstCtx.cr0 & X86_CR0_EM)
        && (pVCpu->cpum.GstCtx.cr4 & X86_CR4_OSFXSR))
    {
        if (!(pVCpu->cpum.GstCtx.cr0 & X86_CR0_TS))
        {
            /*
             * Do the job.
             */
            uint32_t fNewMxCsr;
            VBOXSTRICTRC rcStrict = iemMemFetchDataU32(pVCpu, &fNewMxCsr, iEffSeg, GCPtrEff);
            if (rcStrict == VINF_SUCCESS)
            {
                uint32_t const fMxCsrMask = CPUMGetGuestMxCsrMask(pVCpu->CTX_SUFF(pVM));
                if (!(fNewMxCsr & ~fMxCsrMask))
                {
                    pVCpu->cpum.GstCtx.XState.x87.MXCSR = fNewMxCsr;
                    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
                }
                Log(("ldmxcsr: New MXCSR=%#RX32 & ~MASK=%#RX32 = %#RX32 -> #GP(0)\n",
                     fNewMxCsr, fMxCsrMask, fNewMxCsr & ~fMxCsrMask));
                return iemRaiseGeneralProtectionFault0(pVCpu);
            }
            return rcStrict;
        }
        return iemRaiseDeviceNotAvailable(pVCpu);
    }
    return iemRaiseUndefinedOpcode(pVCpu);
}


/**
 * Commmon routine for fnstenv and fnsave.
 *
 * @param   pVCpu           The cross context virtual CPU structure of the calling thread.
 * @param   enmEffOpSize    The effective operand size.
 * @param   uPtr            Where to store the state.
 */
static void iemCImplCommonFpuStoreEnv(PVMCPUCC pVCpu, IEMMODE enmEffOpSize, RTPTRUNION uPtr)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87);
    PCX86FXSTATE pSrcX87 = &pVCpu->cpum.GstCtx.XState.x87;
    if (enmEffOpSize == IEMMODE_16BIT)
    {
        uPtr.pu16[0] = pSrcX87->FCW;
        uPtr.pu16[1] = pSrcX87->FSW;
        uPtr.pu16[2] = iemFpuCalcFullFtw(pSrcX87);
        if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
        {
            /** @todo Testcase: How does this work when the FPUIP/CS was saved in
             *        protected mode or long mode and we save it in real mode?  And vice
             *        versa?  And with 32-bit operand size?  I think CPU is storing the
             *        effective address ((CS << 4) + IP) in the offset register and not
             *        doing any address calculations here. */
            uPtr.pu16[3] = (uint16_t)pSrcX87->FPUIP;
            uPtr.pu16[4] = ((pSrcX87->FPUIP >> 4) & UINT16_C(0xf000)) | pSrcX87->FOP;
            uPtr.pu16[5] = (uint16_t)pSrcX87->FPUDP;
            uPtr.pu16[6] = (pSrcX87->FPUDP  >> 4) & UINT16_C(0xf000);
        }
        else
        {
            uPtr.pu16[3] = pSrcX87->FPUIP;
            uPtr.pu16[4] = pSrcX87->CS;
            uPtr.pu16[5] = pSrcX87->FPUDP;
            uPtr.pu16[6] = pSrcX87->DS;
        }
    }
    else
    {
        /** @todo Testcase: what is stored in the "gray" areas? (figure 8-9 and 8-10) */
        uPtr.pu16[0*2]   = pSrcX87->FCW;
        uPtr.pu16[0*2+1] = 0xffff;  /* (0xffff observed on intel skylake.) */
        uPtr.pu16[1*2]   = pSrcX87->FSW;
        uPtr.pu16[1*2+1] = 0xffff;
        uPtr.pu16[2*2]   = iemFpuCalcFullFtw(pSrcX87);
        uPtr.pu16[2*2+1] = 0xffff;
        if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
        {
            uPtr.pu16[3*2]   = (uint16_t)pSrcX87->FPUIP;
            uPtr.pu32[4]     = ((pSrcX87->FPUIP & UINT32_C(0xffff0000)) >> 4) | pSrcX87->FOP;
            uPtr.pu16[5*2]   = (uint16_t)pSrcX87->FPUDP;
            uPtr.pu32[6]     = (pSrcX87->FPUDP  & UINT32_C(0xffff0000)) >> 4;
        }
        else
        {
            uPtr.pu32[3]     = pSrcX87->FPUIP;
            uPtr.pu16[4*2]   = pSrcX87->CS;
            uPtr.pu16[4*2+1] = pSrcX87->FOP;
            uPtr.pu32[5]     = pSrcX87->FPUDP;
            uPtr.pu16[6*2]   = pSrcX87->DS;
            uPtr.pu16[6*2+1] = 0xffff;
        }
    }
}


/**
 * Commmon routine for fldenv and frstor
 *
 * @param   pVCpu           The cross context virtual CPU structure of the calling thread.
 * @param   enmEffOpSize    The effective operand size.
 * @param   uPtr                Where to store the state.
 */
static void iemCImplCommonFpuRestoreEnv(PVMCPUCC pVCpu, IEMMODE enmEffOpSize, RTCPTRUNION uPtr)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87);
    PX86FXSTATE pDstX87 = &pVCpu->cpum.GstCtx.XState.x87;
    if (enmEffOpSize == IEMMODE_16BIT)
    {
        pDstX87->FCW = uPtr.pu16[0];
        pDstX87->FSW = uPtr.pu16[1];
        pDstX87->FTW = uPtr.pu16[2];
        if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
        {
            pDstX87->FPUIP = uPtr.pu16[3] | ((uint32_t)(uPtr.pu16[4] & UINT16_C(0xf000)) << 4);
            pDstX87->FPUDP = uPtr.pu16[5] | ((uint32_t)(uPtr.pu16[6] & UINT16_C(0xf000)) << 4);
            pDstX87->FOP   = uPtr.pu16[4] & UINT16_C(0x07ff);
            pDstX87->CS    = 0;
            pDstX87->Rsrvd1= 0;
            pDstX87->DS    = 0;
            pDstX87->Rsrvd2= 0;
        }
        else
        {
            pDstX87->FPUIP = uPtr.pu16[3];
            pDstX87->CS    = uPtr.pu16[4];
            pDstX87->Rsrvd1= 0;
            pDstX87->FPUDP = uPtr.pu16[5];
            pDstX87->DS    = uPtr.pu16[6];
            pDstX87->Rsrvd2= 0;
            /** @todo Testcase: Is FOP cleared when doing 16-bit protected mode fldenv? */
        }
    }
    else
    {
        pDstX87->FCW = uPtr.pu16[0*2];
        pDstX87->FSW = uPtr.pu16[1*2];
        pDstX87->FTW = uPtr.pu16[2*2];
        if (IEM_IS_REAL_OR_V86_MODE(pVCpu))
        {
            pDstX87->FPUIP = uPtr.pu16[3*2] | ((uPtr.pu32[4] & UINT32_C(0x0ffff000)) << 4);
            pDstX87->FOP   = uPtr.pu32[4] & UINT16_C(0x07ff);
            pDstX87->FPUDP = uPtr.pu16[5*2] | ((uPtr.pu32[6] & UINT32_C(0x0ffff000)) << 4);
            pDstX87->CS    = 0;
            pDstX87->Rsrvd1= 0;
            pDstX87->DS    = 0;
            pDstX87->Rsrvd2= 0;
        }
        else
        {
            pDstX87->FPUIP = uPtr.pu32[3];
            pDstX87->CS    = uPtr.pu16[4*2];
            pDstX87->Rsrvd1= 0;
            pDstX87->FOP   = uPtr.pu16[4*2+1];
            pDstX87->FPUDP = uPtr.pu32[5];
            pDstX87->DS    = uPtr.pu16[6*2];
            pDstX87->Rsrvd2= 0;
        }
    }

    /* Make adjustments. */
    pDstX87->FTW = iemFpuCompressFtw(pDstX87->FTW);
#ifdef LOG_ENABLED
    uint16_t const fOldFsw = pDstX87->FSW;
#endif
    pDstX87->FCW &= ~X86_FCW_ZERO_MASK | X86_FCW_IC_MASK; /* Intel 10980xe allows setting the IC bit. Win 3.11 CALC.EXE sets it. */
    iemFpuRecalcExceptionStatus(pDstX87);
#ifdef LOG_ENABLED
    if ((pDstX87->FSW & X86_FSW_ES) ^ (fOldFsw & X86_FSW_ES))
        Log11(("iemCImplCommonFpuRestoreEnv: %04x:%08RX64: %s FPU exception (FCW=%#x FSW=%#x -> %#x)\n",
               pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, fOldFsw & X86_FSW_ES ? "Supressed" : "Raised",
               pDstX87->FCW, fOldFsw, pDstX87->FSW));
#endif

    /** @todo Testcase: Check if ES and/or B are automatically cleared if no
     *        exceptions are pending after loading the saved state? */
}


/**
 * Implements 'FNSTENV'.
 *
 * @param   enmEffOpSize    The operand size (only REX.W really matters).
 * @param   iEffSeg         The effective segment register for @a GCPtrEffDst.
 * @param   GCPtrEffDst     The address of the image.
 */
IEM_CIMPL_DEF_3(iemCImpl_fnstenv, IEMMODE, enmEffOpSize, uint8_t, iEffSeg, RTGCPTR, GCPtrEffDst)
{
    RTPTRUNION   uPtr;
    VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, &uPtr.pv, enmEffOpSize == IEMMODE_16BIT ? 14 : 28,
                                      iEffSeg, GCPtrEffDst, IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE,
                                      enmEffOpSize == IEMMODE_16BIT ? 1 : 3 /** @todo ? */);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    iemCImplCommonFpuStoreEnv(pVCpu, enmEffOpSize, uPtr);

    rcStrict = iemMemCommitAndUnmap(pVCpu, uPtr.pv, IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Mask all math exceptions. Any possibly pending exceptions will be cleared. */
    PX86FXSTATE pFpuCtx = &pVCpu->cpum.GstCtx.XState.x87;
    pFpuCtx->FCW |= X86_FCW_XCPT_MASK;
#ifdef LOG_ENABLED
    uint16_t fOldFsw = pFpuCtx->FSW;
#endif
    iemFpuRecalcExceptionStatus(pFpuCtx);
#ifdef LOG_ENABLED
    if ((pFpuCtx->FSW & X86_FSW_ES) ^ (fOldFsw & X86_FSW_ES))
        Log11(("fnstenv: %04x:%08RX64: %s FPU exception (FCW=%#x, FSW %#x -> %#x)\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip,
               fOldFsw & X86_FSW_ES ? "Supressed" : "Raised", pFpuCtx->FCW, fOldFsw, pFpuCtx->FSW));
#endif

    iemHlpUsedFpu(pVCpu);

    /* Note: C0, C1, C2 and C3 are documented as undefined, we leave them untouched! */
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'FNSAVE'.
 *
 * @param   enmEffOpSize    The operand size.
 * @param   iEffSeg         The effective segment register for @a GCPtrEffDst.
 * @param   GCPtrEffDst     The address of the image.
 */
IEM_CIMPL_DEF_3(iemCImpl_fnsave, IEMMODE, enmEffOpSize, uint8_t, iEffSeg, RTGCPTR, GCPtrEffDst)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87);

    RTPTRUNION   uPtr;
    VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, &uPtr.pv, enmEffOpSize == IEMMODE_16BIT ? 94 : 108,
                                      iEffSeg, GCPtrEffDst, IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE, 3 /** @todo ? */);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    PX86FXSTATE pFpuCtx = &pVCpu->cpum.GstCtx.XState.x87;
    iemCImplCommonFpuStoreEnv(pVCpu, enmEffOpSize, uPtr);
    PRTFLOAT80U paRegs = (PRTFLOAT80U)(uPtr.pu8 + (enmEffOpSize == IEMMODE_16BIT ? 14 : 28));
    for (uint32_t i = 0; i < RT_ELEMENTS(pFpuCtx->aRegs); i++)
    {
        paRegs[i].au32[0] = pFpuCtx->aRegs[i].au32[0];
        paRegs[i].au32[1] = pFpuCtx->aRegs[i].au32[1];
        paRegs[i].au16[4] = pFpuCtx->aRegs[i].au16[4];
    }

    rcStrict = iemMemCommitAndUnmap(pVCpu, uPtr.pv, IEM_ACCESS_DATA_W | IEM_ACCESS_PARTIAL_WRITE);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    /* Rotate the stack to account for changed TOS. */
    iemFpuRotateStackSetTop(pFpuCtx, 0);

    /*
     * Re-initialize the FPU context.
     */
    pFpuCtx->FCW   = 0x37f;
    pFpuCtx->FSW   = 0;
    pFpuCtx->FTW   = 0x00;       /* 0 - empty */
    pFpuCtx->FPUDP = 0;
    pFpuCtx->DS    = 0;
    pFpuCtx->Rsrvd2= 0;
    pFpuCtx->FPUIP = 0;
    pFpuCtx->CS    = 0;
    pFpuCtx->Rsrvd1= 0;
    pFpuCtx->FOP   = 0;

    iemHlpUsedFpu(pVCpu);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'FLDENV'.
 *
 * @param   enmEffOpSize    The operand size (only REX.W really matters).
 * @param   iEffSeg         The effective segment register for @a GCPtrEffSrc.
 * @param   GCPtrEffSrc     The address of the image.
 */
IEM_CIMPL_DEF_3(iemCImpl_fldenv, IEMMODE, enmEffOpSize, uint8_t, iEffSeg, RTGCPTR, GCPtrEffSrc)
{
    RTCPTRUNION  uPtr;
    VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, (void **)&uPtr.pv, enmEffOpSize == IEMMODE_16BIT ? 14 : 28,
                                      iEffSeg, GCPtrEffSrc, IEM_ACCESS_DATA_R,
                                      enmEffOpSize == IEMMODE_16BIT ? 1 : 3 /** @todo ?*/);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    iemCImplCommonFpuRestoreEnv(pVCpu, enmEffOpSize, uPtr);

    rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)uPtr.pv, IEM_ACCESS_DATA_R);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    iemHlpUsedFpu(pVCpu);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'FRSTOR'.
 *
 * @param   enmEffOpSize    The operand size.
 * @param   iEffSeg         The effective segment register for @a GCPtrEffSrc.
 * @param   GCPtrEffSrc     The address of the image.
 */
IEM_CIMPL_DEF_3(iemCImpl_frstor, IEMMODE, enmEffOpSize, uint8_t, iEffSeg, RTGCPTR, GCPtrEffSrc)
{
    RTCPTRUNION  uPtr;
    VBOXSTRICTRC rcStrict = iemMemMap(pVCpu, (void **)&uPtr.pv, enmEffOpSize == IEMMODE_16BIT ? 94 : 108,
                                      iEffSeg, GCPtrEffSrc, IEM_ACCESS_DATA_R, 3 /** @todo ?*/ );
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    PX86FXSTATE pFpuCtx = &pVCpu->cpum.GstCtx.XState.x87;
    iemCImplCommonFpuRestoreEnv(pVCpu, enmEffOpSize, uPtr);
    PCRTFLOAT80U paRegs = (PCRTFLOAT80U)(uPtr.pu8 + (enmEffOpSize == IEMMODE_16BIT ? 14 : 28));
    for (uint32_t i = 0; i < RT_ELEMENTS(pFpuCtx->aRegs); i++)
    {
        pFpuCtx->aRegs[i].au32[0] = paRegs[i].au32[0];
        pFpuCtx->aRegs[i].au32[1] = paRegs[i].au32[1];
        pFpuCtx->aRegs[i].au32[2] = paRegs[i].au16[4];
        pFpuCtx->aRegs[i].au32[3] = 0;
    }

    rcStrict = iemMemCommitAndUnmap(pVCpu, (void *)uPtr.pv, IEM_ACCESS_DATA_R);
    if (rcStrict != VINF_SUCCESS)
        return rcStrict;

    iemHlpUsedFpu(pVCpu);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'FLDCW'.
 *
 * @param   u16Fcw          The new FCW.
 */
IEM_CIMPL_DEF_1(iemCImpl_fldcw, uint16_t, u16Fcw)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87);

    /** @todo Testcase: Check what happens when trying to load X86_FCW_PC_RSVD. */
    /** @todo Testcase: Try see what happens when trying to set undefined bits
     *        (other than 6 and 7).  Currently ignoring them. */
    /** @todo Testcase: Test that it raises and loweres the FPU exception bits
     *        according to FSW. (This is what is currently implemented.) */
    PX86FXSTATE pFpuCtx = &pVCpu->cpum.GstCtx.XState.x87;
    pFpuCtx->FCW = u16Fcw & (~X86_FCW_ZERO_MASK | X86_FCW_IC_MASK); /* Intel 10980xe allows setting the IC bit. Win 3.11 CALC.EXE sets it. */
#ifdef LOG_ENABLED
    uint16_t fOldFsw = pFpuCtx->FSW;
#endif
    iemFpuRecalcExceptionStatus(pFpuCtx);
#ifdef LOG_ENABLED
    if ((pFpuCtx->FSW & X86_FSW_ES) ^ (fOldFsw & X86_FSW_ES))
        Log11(("fldcw: %04x:%08RX64: %s FPU exception (FCW=%#x, FSW %#x -> %#x)\n", pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip,
               fOldFsw & X86_FSW_ES ? "Supressed" : "Raised", pFpuCtx->FCW, fOldFsw, pFpuCtx->FSW));
#endif

    /* Note: C0, C1, C2 and C3 are documented as undefined, we leave them untouched! */
    iemHlpUsedFpu(pVCpu);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements the underflow case of fxch.
 *
 * @param   iStReg              The other stack register.
 */
IEM_CIMPL_DEF_1(iemCImpl_fxch_underflow, uint8_t, iStReg)
{
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87);

    PX86FXSTATE pFpuCtx = &pVCpu->cpum.GstCtx.XState.x87;
    unsigned const iReg1 = X86_FSW_TOP_GET(pFpuCtx->FSW);
    unsigned const iReg2 = (iReg1 + iStReg) & X86_FSW_TOP_SMASK;
    Assert(!(RT_BIT(iReg1) & pFpuCtx->FTW) || !(RT_BIT(iReg2) & pFpuCtx->FTW));

    /** @todo Testcase: fxch underflow. Making assumptions that underflowed
     *        registers are read as QNaN and then exchanged. This could be
     *        wrong... */
    if (pFpuCtx->FCW & X86_FCW_IM)
    {
        if (RT_BIT(iReg1) & pFpuCtx->FTW)
        {
            if (RT_BIT(iReg2) & pFpuCtx->FTW)
                iemFpuStoreQNan(&pFpuCtx->aRegs[0].r80);
            else
                pFpuCtx->aRegs[0].r80 = pFpuCtx->aRegs[iStReg].r80;
            iemFpuStoreQNan(&pFpuCtx->aRegs[iStReg].r80);
        }
        else
        {
            pFpuCtx->aRegs[iStReg].r80 = pFpuCtx->aRegs[0].r80;
            iemFpuStoreQNan(&pFpuCtx->aRegs[0].r80);
        }
        pFpuCtx->FSW &= ~X86_FSW_C_MASK;
        pFpuCtx->FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF;
    }
    else
    {
        /* raise underflow exception, don't change anything. */
        pFpuCtx->FSW &= ~(X86_FSW_TOP_MASK | X86_FSW_XCPT_MASK);
        pFpuCtx->FSW |= X86_FSW_C1 | X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
        Log11(("fxch: %04x:%08RX64: Underflow exception (FSW=%#x)\n",
               pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pFpuCtx->FSW));
    }

    iemFpuUpdateOpcodeAndIpWorker(pVCpu, pFpuCtx);
    iemHlpUsedFpu(pVCpu);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}


/**
 * Implements 'FCOMI', 'FCOMIP', 'FUCOMI', and 'FUCOMIP'.
 *
 * @param   iStReg          The other stack register.
 * @param   pfnAImpl        The assembly comparison implementation.
 * @param   fPop            Whether we should pop the stack when done or not.
 */
IEM_CIMPL_DEF_3(iemCImpl_fcomi_fucomi, uint8_t, iStReg, PFNIEMAIMPLFPUR80EFL, pfnAImpl, bool, fPop)
{
    Assert(iStReg < 8);
    IEM_CTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_X87);

    /*
     * Raise exceptions.
     */
    if (pVCpu->cpum.GstCtx.cr0 & (X86_CR0_EM | X86_CR0_TS))
        return iemRaiseDeviceNotAvailable(pVCpu);

    PX86FXSTATE pFpuCtx = &pVCpu->cpum.GstCtx.XState.x87;
    uint16_t u16Fsw = pFpuCtx->FSW;
    if (u16Fsw & X86_FSW_ES)
        return iemRaiseMathFault(pVCpu);

    /*
     * Check if any of the register accesses causes #SF + #IA.
     */
    unsigned const iReg1 = X86_FSW_TOP_GET(u16Fsw);
    unsigned const iReg2 = (iReg1 + iStReg) & X86_FSW_TOP_SMASK;
    if ((pFpuCtx->FTW & (RT_BIT(iReg1) | RT_BIT(iReg2))) == (RT_BIT(iReg1) | RT_BIT(iReg2)))
    {
        uint32_t u32Eflags = pfnAImpl(pFpuCtx, &u16Fsw, &pFpuCtx->aRegs[0].r80, &pFpuCtx->aRegs[iStReg].r80);

        pFpuCtx->FSW &= ~X86_FSW_C1;
        pFpuCtx->FSW |= u16Fsw & ~X86_FSW_TOP_MASK;
        if (   !(u16Fsw & X86_FSW_IE)
            || (pFpuCtx->FCW & X86_FCW_IM) )
        {
            pVCpu->cpum.GstCtx.eflags.u &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF);
            pVCpu->cpum.GstCtx.eflags.u |= u32Eflags & (X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF);
        }
    }
    else if (pFpuCtx->FCW & X86_FCW_IM)
    {
        /* Masked underflow. */
        pFpuCtx->FSW &= ~X86_FSW_C1;
        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF;
        pVCpu->cpum.GstCtx.eflags.u &= ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF);
        pVCpu->cpum.GstCtx.eflags.u |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF;
    }
    else
    {
        /* Raise underflow - don't touch EFLAGS or TOP. */
        pFpuCtx->FSW &= ~X86_FSW_C1;
        pFpuCtx->FSW |= X86_FSW_IE | X86_FSW_SF | X86_FSW_ES | X86_FSW_B;
        Log11(("fxch: %04x:%08RX64: Raising IE+SF exception (FSW=%#x)\n",
               pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pFpuCtx->FSW));
        fPop = false;
    }

    /*
     * Pop if necessary.
     */
    if (fPop)
    {
        pFpuCtx->FTW &= ~RT_BIT(iReg1);
        iemFpuStackIncTop(pVCpu);
    }

    iemFpuUpdateOpcodeAndIpWorker(pVCpu, pFpuCtx);
    iemHlpUsedFpu(pVCpu);
    return iemRegAddToRipAndFinishingClearingRF(pVCpu, cbInstr);
}

/** @} */