diff options
Diffstat (limited to 'src/VBox/VMM/VMMR3/PDMBlkCache.cpp')
-rw-r--r-- | src/VBox/VMM/VMMR3/PDMBlkCache.cpp | 2805 |
1 files changed, 2805 insertions, 0 deletions
diff --git a/src/VBox/VMM/VMMR3/PDMBlkCache.cpp b/src/VBox/VMM/VMMR3/PDMBlkCache.cpp new file mode 100644 index 00000000..7a0fa776 --- /dev/null +++ b/src/VBox/VMM/VMMR3/PDMBlkCache.cpp @@ -0,0 +1,2805 @@ +/* $Id: PDMBlkCache.cpp $ */ +/** @file + * PDM Block Cache. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + +/** @page pg_pdm_block_cache PDM Block Cache - The I/O cache + * This component implements an I/O cache based on the 2Q cache algorithm. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_PDM_BLK_CACHE +#include "PDMInternal.h" +#include <iprt/asm.h> +#include <iprt/mem.h> +#include <iprt/path.h> +#include <iprt/string.h> +#include <iprt/trace.h> +#include <VBox/log.h> +#include <VBox/vmm/stam.h> +#include <VBox/vmm/uvm.h> +#include <VBox/vmm/vm.h> + +#include "PDMBlkCacheInternal.h" + +#ifdef VBOX_STRICT +# define PDMACFILECACHE_IS_CRITSECT_OWNER(Cache) \ + do \ + { \ + AssertMsg(RTCritSectIsOwner(&Cache->CritSect), \ + ("Thread does not own critical section\n"));\ + } while (0) + +# define PDMACFILECACHE_EP_IS_SEMRW_WRITE_OWNER(pEpCache) \ + do \ + { \ + AssertMsg(RTSemRWIsWriteOwner(pEpCache->SemRWEntries), \ + ("Thread is not exclusive owner of the per endpoint RW semaphore\n")); \ + } while (0) + +# define PDMACFILECACHE_EP_IS_SEMRW_READ_OWNER(pEpCache) \ + do \ + { \ + AssertMsg(RTSemRWIsReadOwner(pEpCache->SemRWEntries), \ + ("Thread is not read owner of the per endpoint RW semaphore\n")); \ + } while (0) + +#else +# define PDMACFILECACHE_IS_CRITSECT_OWNER(Cache) do { } while (0) +# define PDMACFILECACHE_EP_IS_SEMRW_WRITE_OWNER(pEpCache) do { } while (0) +# define PDMACFILECACHE_EP_IS_SEMRW_READ_OWNER(pEpCache) do { } while (0) +#endif + +#define PDM_BLK_CACHE_SAVED_STATE_VERSION 1 + +/* Enable to enable some tracing in the block cache code for investigating issues. */ +/*#define VBOX_BLKCACHE_TRACING 1*/ + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ + +static PPDMBLKCACHEENTRY pdmBlkCacheEntryAlloc(PPDMBLKCACHE pBlkCache, + uint64_t off, size_t cbData, uint8_t *pbBuffer); +static bool pdmBlkCacheAddDirtyEntry(PPDMBLKCACHE pBlkCache, PPDMBLKCACHEENTRY pEntry); + + +/** + * Add message to the VM trace buffer. + * + * @returns nothing. + * @param pBlkCache The block cache. + * @param pszFmt The format string. + * @param ... Additional parameters for the string formatter. + */ +DECLINLINE(void) pdmBlkCacheR3TraceMsgF(PPDMBLKCACHE pBlkCache, const char *pszFmt, ...) +{ +#if defined(VBOX_BLKCACHE_TRACING) + va_list va; + va_start(va, pszFmt); + RTTraceBufAddMsgV(pBlkCache->pCache->pVM->CTX_SUFF(hTraceBuf), pszFmt, va); + va_end(va); +#else + RT_NOREF2(pBlkCache, pszFmt); +#endif +} + +/** + * Decrement the reference counter of the given cache entry. + * + * @returns nothing. + * @param pEntry The entry to release. + */ +DECLINLINE(void) pdmBlkCacheEntryRelease(PPDMBLKCACHEENTRY pEntry) +{ + AssertMsg(pEntry->cRefs > 0, ("Trying to release a not referenced entry\n")); + ASMAtomicDecU32(&pEntry->cRefs); +} + +/** + * Increment the reference counter of the given cache entry. + * + * @returns nothing. + * @param pEntry The entry to reference. + */ +DECLINLINE(void) pdmBlkCacheEntryRef(PPDMBLKCACHEENTRY pEntry) +{ + ASMAtomicIncU32(&pEntry->cRefs); +} + +#ifdef VBOX_STRICT +static void pdmBlkCacheValidate(PPDMBLKCACHEGLOBAL pCache) +{ + /* Amount of cached data should never exceed the maximum amount. */ + AssertMsg(pCache->cbCached <= pCache->cbMax, + ("Current amount of cached data exceeds maximum\n")); + + /* The amount of cached data in the LRU and FRU list should match cbCached */ + AssertMsg(pCache->LruRecentlyUsedIn.cbCached + pCache->LruFrequentlyUsed.cbCached == pCache->cbCached, + ("Amount of cached data doesn't match\n")); + + AssertMsg(pCache->LruRecentlyUsedOut.cbCached <= pCache->cbRecentlyUsedOutMax, + ("Paged out list exceeds maximum\n")); +} +#endif + +DECLINLINE(void) pdmBlkCacheLockEnter(PPDMBLKCACHEGLOBAL pCache) +{ + RTCritSectEnter(&pCache->CritSect); +#ifdef VBOX_STRICT + pdmBlkCacheValidate(pCache); +#endif +} + +DECLINLINE(void) pdmBlkCacheLockLeave(PPDMBLKCACHEGLOBAL pCache) +{ +#ifdef VBOX_STRICT + pdmBlkCacheValidate(pCache); +#endif + RTCritSectLeave(&pCache->CritSect); +} + +DECLINLINE(void) pdmBlkCacheSub(PPDMBLKCACHEGLOBAL pCache, uint32_t cbAmount) +{ + PDMACFILECACHE_IS_CRITSECT_OWNER(pCache); + pCache->cbCached -= cbAmount; +} + +DECLINLINE(void) pdmBlkCacheAdd(PPDMBLKCACHEGLOBAL pCache, uint32_t cbAmount) +{ + PDMACFILECACHE_IS_CRITSECT_OWNER(pCache); + pCache->cbCached += cbAmount; +} + +DECLINLINE(void) pdmBlkCacheListAdd(PPDMBLKLRULIST pList, uint32_t cbAmount) +{ + pList->cbCached += cbAmount; +} + +DECLINLINE(void) pdmBlkCacheListSub(PPDMBLKLRULIST pList, uint32_t cbAmount) +{ + pList->cbCached -= cbAmount; +} + +#ifdef PDMACFILECACHE_WITH_LRULIST_CHECKS +/** + * Checks consistency of a LRU list. + * + * @returns nothing + * @param pList The LRU list to check. + * @param pNotInList Element which is not allowed to occur in the list. + */ +static void pdmBlkCacheCheckList(PPDMBLKLRULIST pList, PPDMBLKCACHEENTRY pNotInList) +{ + PPDMBLKCACHEENTRY pCurr = pList->pHead; + + /* Check that there are no double entries and no cycles in the list. */ + while (pCurr) + { + PPDMBLKCACHEENTRY pNext = pCurr->pNext; + + while (pNext) + { + AssertMsg(pCurr != pNext, + ("Entry %#p is at least two times in list %#p or there is a cycle in the list\n", + pCurr, pList)); + pNext = pNext->pNext; + } + + AssertMsg(pCurr != pNotInList, ("Not allowed entry %#p is in list\n", pCurr)); + + if (!pCurr->pNext) + AssertMsg(pCurr == pList->pTail, ("End of list reached but last element is not list tail\n")); + + pCurr = pCurr->pNext; + } +} +#endif + +/** + * Unlinks a cache entry from the LRU list it is assigned to. + * + * @returns nothing. + * @param pEntry The entry to unlink. + */ +static void pdmBlkCacheEntryRemoveFromList(PPDMBLKCACHEENTRY pEntry) +{ + PPDMBLKLRULIST pList = pEntry->pList; + PPDMBLKCACHEENTRY pPrev, pNext; + + LogFlowFunc((": Deleting entry %#p from list %#p\n", pEntry, pList)); + + AssertPtr(pList); + +#ifdef PDMACFILECACHE_WITH_LRULIST_CHECKS + pdmBlkCacheCheckList(pList, NULL); +#endif + + pPrev = pEntry->pPrev; + pNext = pEntry->pNext; + + AssertMsg(pEntry != pPrev, ("Entry links to itself as previous element\n")); + AssertMsg(pEntry != pNext, ("Entry links to itself as next element\n")); + + if (pPrev) + pPrev->pNext = pNext; + else + { + pList->pHead = pNext; + + if (pNext) + pNext->pPrev = NULL; + } + + if (pNext) + pNext->pPrev = pPrev; + else + { + pList->pTail = pPrev; + + if (pPrev) + pPrev->pNext = NULL; + } + + pEntry->pList = NULL; + pEntry->pPrev = NULL; + pEntry->pNext = NULL; + pdmBlkCacheListSub(pList, pEntry->cbData); +#ifdef PDMACFILECACHE_WITH_LRULIST_CHECKS + pdmBlkCacheCheckList(pList, pEntry); +#endif +} + +/** + * Adds a cache entry to the given LRU list unlinking it from the currently + * assigned list if needed. + * + * @returns nothing. + * @param pList List to the add entry to. + * @param pEntry Entry to add. + */ +static void pdmBlkCacheEntryAddToList(PPDMBLKLRULIST pList, PPDMBLKCACHEENTRY pEntry) +{ + LogFlowFunc((": Adding entry %#p to list %#p\n", pEntry, pList)); +#ifdef PDMACFILECACHE_WITH_LRULIST_CHECKS + pdmBlkCacheCheckList(pList, NULL); +#endif + + /* Remove from old list if needed */ + if (pEntry->pList) + pdmBlkCacheEntryRemoveFromList(pEntry); + + pEntry->pNext = pList->pHead; + if (pList->pHead) + pList->pHead->pPrev = pEntry; + else + { + Assert(!pList->pTail); + pList->pTail = pEntry; + } + + pEntry->pPrev = NULL; + pList->pHead = pEntry; + pdmBlkCacheListAdd(pList, pEntry->cbData); + pEntry->pList = pList; +#ifdef PDMACFILECACHE_WITH_LRULIST_CHECKS + pdmBlkCacheCheckList(pList, NULL); +#endif +} + +/** + * Destroys a LRU list freeing all entries. + * + * @returns nothing + * @param pList Pointer to the LRU list to destroy. + * + * @note The caller must own the critical section of the cache. + */ +static void pdmBlkCacheDestroyList(PPDMBLKLRULIST pList) +{ + while (pList->pHead) + { + PPDMBLKCACHEENTRY pEntry = pList->pHead; + + pList->pHead = pEntry->pNext; + + AssertMsg(!(pEntry->fFlags & (PDMBLKCACHE_ENTRY_IO_IN_PROGRESS | PDMBLKCACHE_ENTRY_IS_DIRTY)), + ("Entry is dirty and/or still in progress fFlags=%#x\n", pEntry->fFlags)); + + RTMemPageFree(pEntry->pbData, pEntry->cbData); + RTMemFree(pEntry); + } +} + +/** + * Tries to remove the given amount of bytes from a given list in the cache + * moving the entries to one of the given ghosts lists + * + * @returns Amount of data which could be freed. + * @param pCache Pointer to the global cache data. + * @param cbData The amount of the data to free. + * @param pListSrc The source list to evict data from. + * @param pGhostListDst Where the ghost list removed entries should be + * moved to, NULL if the entry should be freed. + * @param fReuseBuffer Flag whether a buffer should be reused if it has + * the same size + * @param ppbBuffer Where to store the address of the buffer if an + * entry with the same size was found and + * fReuseBuffer is true. + * + * @note This function may return fewer bytes than requested because entries + * may be marked as non evictable if they are used for I/O at the + * moment. + */ +static size_t pdmBlkCacheEvictPagesFrom(PPDMBLKCACHEGLOBAL pCache, size_t cbData, + PPDMBLKLRULIST pListSrc, PPDMBLKLRULIST pGhostListDst, + bool fReuseBuffer, uint8_t **ppbBuffer) +{ + size_t cbEvicted = 0; + + PDMACFILECACHE_IS_CRITSECT_OWNER(pCache); + + AssertMsg(cbData > 0, ("Evicting 0 bytes not possible\n")); + AssertMsg( !pGhostListDst + || (pGhostListDst == &pCache->LruRecentlyUsedOut), + ("Destination list must be NULL or the recently used but paged out list\n")); + + if (fReuseBuffer) + { + AssertPtr(ppbBuffer); + *ppbBuffer = NULL; + } + + /* Start deleting from the tail. */ + PPDMBLKCACHEENTRY pEntry = pListSrc->pTail; + + while ((cbEvicted < cbData) && pEntry) + { + PPDMBLKCACHEENTRY pCurr = pEntry; + + pEntry = pEntry->pPrev; + + /* We can't evict pages which are currently in progress or dirty but not in progress */ + if ( !(pCurr->fFlags & PDMBLKCACHE_NOT_EVICTABLE) + && (ASMAtomicReadU32(&pCurr->cRefs) == 0)) + { + /* Ok eviction candidate. Grab the endpoint semaphore and check again + * because somebody else might have raced us. */ + PPDMBLKCACHE pBlkCache = pCurr->pBlkCache; + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + + if (!(pCurr->fFlags & PDMBLKCACHE_NOT_EVICTABLE) + && (ASMAtomicReadU32(&pCurr->cRefs) == 0)) + { + LogFlow(("Evicting entry %#p (%u bytes)\n", pCurr, pCurr->cbData)); + + if (fReuseBuffer && pCurr->cbData == cbData) + { + STAM_COUNTER_INC(&pCache->StatBuffersReused); + *ppbBuffer = pCurr->pbData; + } + else if (pCurr->pbData) + RTMemPageFree(pCurr->pbData, pCurr->cbData); + + pCurr->pbData = NULL; + cbEvicted += pCurr->cbData; + + pdmBlkCacheEntryRemoveFromList(pCurr); + pdmBlkCacheSub(pCache, pCurr->cbData); + + if (pGhostListDst) + { + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + PPDMBLKCACHEENTRY pGhostEntFree = pGhostListDst->pTail; + + /* We have to remove the last entries from the paged out list. */ + while ( pGhostListDst->cbCached + pCurr->cbData > pCache->cbRecentlyUsedOutMax + && pGhostEntFree) + { + PPDMBLKCACHEENTRY pFree = pGhostEntFree; + PPDMBLKCACHE pBlkCacheFree = pFree->pBlkCache; + + pGhostEntFree = pGhostEntFree->pPrev; + + RTSemRWRequestWrite(pBlkCacheFree->SemRWEntries, RT_INDEFINITE_WAIT); + + if (ASMAtomicReadU32(&pFree->cRefs) == 0) + { + pdmBlkCacheEntryRemoveFromList(pFree); + + STAM_PROFILE_ADV_START(&pCache->StatTreeRemove, Cache); + RTAvlrU64Remove(pBlkCacheFree->pTree, pFree->Core.Key); + STAM_PROFILE_ADV_STOP(&pCache->StatTreeRemove, Cache); + + RTMemFree(pFree); + } + + RTSemRWReleaseWrite(pBlkCacheFree->SemRWEntries); + } + + if (pGhostListDst->cbCached + pCurr->cbData > pCache->cbRecentlyUsedOutMax) + { + /* Couldn't remove enough entries. Delete */ + STAM_PROFILE_ADV_START(&pCache->StatTreeRemove, Cache); + RTAvlrU64Remove(pCurr->pBlkCache->pTree, pCurr->Core.Key); + STAM_PROFILE_ADV_STOP(&pCache->StatTreeRemove, Cache); + + RTMemFree(pCurr); + } + else + pdmBlkCacheEntryAddToList(pGhostListDst, pCurr); + } + else + { + /* Delete the entry from the AVL tree it is assigned to. */ + STAM_PROFILE_ADV_START(&pCache->StatTreeRemove, Cache); + RTAvlrU64Remove(pCurr->pBlkCache->pTree, pCurr->Core.Key); + STAM_PROFILE_ADV_STOP(&pCache->StatTreeRemove, Cache); + + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + RTMemFree(pCurr); + } + } + + } + else + LogFlow(("Entry %#p (%u bytes) is still in progress and can't be evicted\n", pCurr, pCurr->cbData)); + } + + return cbEvicted; +} + +static bool pdmBlkCacheReclaim(PPDMBLKCACHEGLOBAL pCache, size_t cbData, bool fReuseBuffer, uint8_t **ppbBuffer) +{ + size_t cbRemoved = 0; + + if ((pCache->cbCached + cbData) < pCache->cbMax) + return true; + else if ((pCache->LruRecentlyUsedIn.cbCached + cbData) > pCache->cbRecentlyUsedInMax) + { + /* Try to evict as many bytes as possible from A1in */ + cbRemoved = pdmBlkCacheEvictPagesFrom(pCache, cbData, &pCache->LruRecentlyUsedIn, + &pCache->LruRecentlyUsedOut, fReuseBuffer, ppbBuffer); + + /* + * If it was not possible to remove enough entries + * try the frequently accessed cache. + */ + if (cbRemoved < cbData) + { + Assert(!fReuseBuffer || !*ppbBuffer); /* It is not possible that we got a buffer with the correct size but we didn't freed enough data. */ + + /* + * If we removed something we can't pass the reuse buffer flag anymore because + * we don't need to evict that much data + */ + if (!cbRemoved) + cbRemoved += pdmBlkCacheEvictPagesFrom(pCache, cbData, &pCache->LruFrequentlyUsed, + NULL, fReuseBuffer, ppbBuffer); + else + cbRemoved += pdmBlkCacheEvictPagesFrom(pCache, cbData - cbRemoved, &pCache->LruFrequentlyUsed, + NULL, false, NULL); + } + } + else + { + /* We have to remove entries from frequently access list. */ + cbRemoved = pdmBlkCacheEvictPagesFrom(pCache, cbData, &pCache->LruFrequentlyUsed, + NULL, fReuseBuffer, ppbBuffer); + } + + LogFlowFunc((": removed %u bytes, requested %u\n", cbRemoved, cbData)); + return (cbRemoved >= cbData); +} + +DECLINLINE(int) pdmBlkCacheEnqueue(PPDMBLKCACHE pBlkCache, uint64_t off, size_t cbXfer, PPDMBLKCACHEIOXFER pIoXfer) +{ + int rc = VINF_SUCCESS; + + LogFlowFunc(("%s: Enqueuing hIoXfer=%#p enmXferDir=%d\n", + __FUNCTION__, pIoXfer, pIoXfer->enmXferDir)); + + ASMAtomicIncU32(&pBlkCache->cIoXfersActive); + pdmBlkCacheR3TraceMsgF(pBlkCache, "BlkCache: I/O req %#p (%RTbool , %d) queued (%u now active)", + pIoXfer, pIoXfer->fIoCache, pIoXfer->enmXferDir, pBlkCache->cIoXfersActive); + + switch (pBlkCache->enmType) + { + case PDMBLKCACHETYPE_DEV: + { + rc = pBlkCache->u.Dev.pfnXferEnqueue(pBlkCache->u.Dev.pDevIns, + pIoXfer->enmXferDir, + off, cbXfer, + &pIoXfer->SgBuf, pIoXfer); + break; + } + case PDMBLKCACHETYPE_DRV: + { + rc = pBlkCache->u.Drv.pfnXferEnqueue(pBlkCache->u.Drv.pDrvIns, + pIoXfer->enmXferDir, + off, cbXfer, + &pIoXfer->SgBuf, pIoXfer); + break; + } + case PDMBLKCACHETYPE_USB: + { + rc = pBlkCache->u.Usb.pfnXferEnqueue(pBlkCache->u.Usb.pUsbIns, + pIoXfer->enmXferDir, + off, cbXfer, + &pIoXfer->SgBuf, pIoXfer); + break; + } + case PDMBLKCACHETYPE_INTERNAL: + { + rc = pBlkCache->u.Int.pfnXferEnqueue(pBlkCache->u.Int.pvUser, + pIoXfer->enmXferDir, + off, cbXfer, + &pIoXfer->SgBuf, pIoXfer); + break; + } + default: + AssertMsgFailed(("Unknown block cache type!\n")); + } + + if (RT_FAILURE(rc)) + { + pdmBlkCacheR3TraceMsgF(pBlkCache, "BlkCache: Queueing I/O req %#p failed %Rrc", pIoXfer, rc); + ASMAtomicDecU32(&pBlkCache->cIoXfersActive); + } + + LogFlowFunc(("%s: returns rc=%Rrc\n", __FUNCTION__, rc)); + return rc; +} + +/** + * Initiates a read I/O task for the given entry. + * + * @returns VBox status code. + * @param pEntry The entry to fetch the data to. + */ +static int pdmBlkCacheEntryReadFromMedium(PPDMBLKCACHEENTRY pEntry) +{ + PPDMBLKCACHE pBlkCache = pEntry->pBlkCache; + LogFlowFunc((": Reading data into cache entry %#p\n", pEntry)); + + /* Make sure no one evicts the entry while it is accessed. */ + pEntry->fFlags |= PDMBLKCACHE_ENTRY_IO_IN_PROGRESS; + + PPDMBLKCACHEIOXFER pIoXfer = (PPDMBLKCACHEIOXFER)RTMemAllocZ(sizeof(PDMBLKCACHEIOXFER)); + if (RT_UNLIKELY(!pIoXfer)) + return VERR_NO_MEMORY; + + AssertMsg(pEntry->pbData, ("Entry is in ghost state\n")); + + pIoXfer->fIoCache = true; + pIoXfer->pEntry = pEntry; + pIoXfer->SgSeg.pvSeg = pEntry->pbData; + pIoXfer->SgSeg.cbSeg = pEntry->cbData; + pIoXfer->enmXferDir = PDMBLKCACHEXFERDIR_READ; + RTSgBufInit(&pIoXfer->SgBuf, &pIoXfer->SgSeg, 1); + + return pdmBlkCacheEnqueue(pBlkCache, pEntry->Core.Key, pEntry->cbData, pIoXfer); +} + +/** + * Initiates a write I/O task for the given entry. + * + * @returns nothing. + * @param pEntry The entry to read the data from. + */ +static int pdmBlkCacheEntryWriteToMedium(PPDMBLKCACHEENTRY pEntry) +{ + PPDMBLKCACHE pBlkCache = pEntry->pBlkCache; + LogFlowFunc((": Writing data from cache entry %#p\n", pEntry)); + + /* Make sure no one evicts the entry while it is accessed. */ + pEntry->fFlags |= PDMBLKCACHE_ENTRY_IO_IN_PROGRESS; + + PPDMBLKCACHEIOXFER pIoXfer = (PPDMBLKCACHEIOXFER)RTMemAllocZ(sizeof(PDMBLKCACHEIOXFER)); + if (RT_UNLIKELY(!pIoXfer)) + return VERR_NO_MEMORY; + + AssertMsg(pEntry->pbData, ("Entry is in ghost state\n")); + + pIoXfer->fIoCache = true; + pIoXfer->pEntry = pEntry; + pIoXfer->SgSeg.pvSeg = pEntry->pbData; + pIoXfer->SgSeg.cbSeg = pEntry->cbData; + pIoXfer->enmXferDir = PDMBLKCACHEXFERDIR_WRITE; + RTSgBufInit(&pIoXfer->SgBuf, &pIoXfer->SgSeg, 1); + + return pdmBlkCacheEnqueue(pBlkCache, pEntry->Core.Key, pEntry->cbData, pIoXfer); +} + +/** + * Passthrough a part of a request directly to the I/O manager handling the + * endpoint. + * + * @returns VBox status code. + * @param pBlkCache The endpoint cache. + * @param pReq The request. + * @param pSgBuf The scatter/gather buffer. + * @param offStart Offset to start transfer from. + * @param cbData Amount of data to transfer. + * @param enmXferDir The transfer type (read/write) + */ +static int pdmBlkCacheRequestPassthrough(PPDMBLKCACHE pBlkCache, PPDMBLKCACHEREQ pReq, + PRTSGBUF pSgBuf, uint64_t offStart, size_t cbData, + PDMBLKCACHEXFERDIR enmXferDir) +{ + + PPDMBLKCACHEIOXFER pIoXfer = (PPDMBLKCACHEIOXFER)RTMemAllocZ(sizeof(PDMBLKCACHEIOXFER)); + if (RT_UNLIKELY(!pIoXfer)) + return VERR_NO_MEMORY; + + ASMAtomicIncU32(&pReq->cXfersPending); + pIoXfer->fIoCache = false; + pIoXfer->pReq = pReq; + pIoXfer->enmXferDir = enmXferDir; + if (pSgBuf) + { + RTSgBufClone(&pIoXfer->SgBuf, pSgBuf); + RTSgBufAdvance(pSgBuf, cbData); + } + + return pdmBlkCacheEnqueue(pBlkCache, offStart, cbData, pIoXfer); +} + +/** + * Commit a single dirty entry to the endpoint + * + * @returns nothing + * @param pEntry The entry to commit. + */ +static void pdmBlkCacheEntryCommit(PPDMBLKCACHEENTRY pEntry) +{ + AssertMsg( (pEntry->fFlags & PDMBLKCACHE_ENTRY_IS_DIRTY) + && !(pEntry->fFlags & PDMBLKCACHE_ENTRY_IO_IN_PROGRESS), + ("Invalid flags set for entry %#p\n", pEntry)); + + pdmBlkCacheEntryWriteToMedium(pEntry); +} + +/** + * Commit all dirty entries for a single endpoint. + * + * @returns nothing. + * @param pBlkCache The endpoint cache to commit. + */ +static void pdmBlkCacheCommit(PPDMBLKCACHE pBlkCache) +{ + uint32_t cbCommitted = 0; + + /* Return if the cache was suspended. */ + if (pBlkCache->fSuspended) + return; + + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + + /* The list is moved to a new header to reduce locking overhead. */ + RTLISTANCHOR ListDirtyNotCommitted; + + RTSpinlockAcquire(pBlkCache->LockList); + RTListMove(&ListDirtyNotCommitted, &pBlkCache->ListDirtyNotCommitted); + RTSpinlockRelease(pBlkCache->LockList); + + if (!RTListIsEmpty(&ListDirtyNotCommitted)) + { + PPDMBLKCACHEENTRY pEntry = RTListGetFirst(&ListDirtyNotCommitted, PDMBLKCACHEENTRY, NodeNotCommitted); + + while (!RTListNodeIsLast(&ListDirtyNotCommitted, &pEntry->NodeNotCommitted)) + { + PPDMBLKCACHEENTRY pNext = RTListNodeGetNext(&pEntry->NodeNotCommitted, PDMBLKCACHEENTRY, + NodeNotCommitted); + pdmBlkCacheEntryCommit(pEntry); + cbCommitted += pEntry->cbData; + RTListNodeRemove(&pEntry->NodeNotCommitted); + pEntry = pNext; + } + + /* Commit the last endpoint */ + Assert(RTListNodeIsLast(&ListDirtyNotCommitted, &pEntry->NodeNotCommitted)); + pdmBlkCacheEntryCommit(pEntry); + cbCommitted += pEntry->cbData; + RTListNodeRemove(&pEntry->NodeNotCommitted); + AssertMsg(RTListIsEmpty(&ListDirtyNotCommitted), + ("Committed all entries but list is not empty\n")); + } + + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + AssertMsg(pBlkCache->pCache->cbDirty >= cbCommitted, + ("Number of committed bytes exceeds number of dirty bytes\n")); + uint32_t cbDirtyOld = ASMAtomicSubU32(&pBlkCache->pCache->cbDirty, cbCommitted); + + /* Reset the commit timer if we don't have any dirty bits. */ + if ( !(cbDirtyOld - cbCommitted) + && pBlkCache->pCache->u32CommitTimeoutMs != 0) + TMTimerStop(pBlkCache->pCache->pTimerCommit); +} + +/** + * Commit all dirty entries in the cache. + * + * @returns nothing. + * @param pCache The global cache instance. + */ +static void pdmBlkCacheCommitDirtyEntries(PPDMBLKCACHEGLOBAL pCache) +{ + bool fCommitInProgress = ASMAtomicXchgBool(&pCache->fCommitInProgress, true); + + if (!fCommitInProgress) + { + pdmBlkCacheLockEnter(pCache); + Assert(!RTListIsEmpty(&pCache->ListUsers)); + + PPDMBLKCACHE pBlkCache = RTListGetFirst(&pCache->ListUsers, PDMBLKCACHE, NodeCacheUser); + AssertPtr(pBlkCache); + + while (!RTListNodeIsLast(&pCache->ListUsers, &pBlkCache->NodeCacheUser)) + { + pdmBlkCacheCommit(pBlkCache); + + pBlkCache = RTListNodeGetNext(&pBlkCache->NodeCacheUser, PDMBLKCACHE, + NodeCacheUser); + } + + /* Commit the last endpoint */ + Assert(RTListNodeIsLast(&pCache->ListUsers, &pBlkCache->NodeCacheUser)); + pdmBlkCacheCommit(pBlkCache); + + pdmBlkCacheLockLeave(pCache); + ASMAtomicWriteBool(&pCache->fCommitInProgress, false); + } +} + +/** + * Adds the given entry as a dirty to the cache. + * + * @returns Flag whether the amount of dirty bytes in the cache exceeds the threshold + * @param pBlkCache The endpoint cache the entry belongs to. + * @param pEntry The entry to add. + */ +static bool pdmBlkCacheAddDirtyEntry(PPDMBLKCACHE pBlkCache, PPDMBLKCACHEENTRY pEntry) +{ + bool fDirtyBytesExceeded = false; + PPDMBLKCACHEGLOBAL pCache = pBlkCache->pCache; + + /* If the commit timer is disabled we commit right away. */ + if (pCache->u32CommitTimeoutMs == 0) + { + pEntry->fFlags |= PDMBLKCACHE_ENTRY_IS_DIRTY; + pdmBlkCacheEntryCommit(pEntry); + } + else if (!(pEntry->fFlags & PDMBLKCACHE_ENTRY_IS_DIRTY)) + { + pEntry->fFlags |= PDMBLKCACHE_ENTRY_IS_DIRTY; + + RTSpinlockAcquire(pBlkCache->LockList); + RTListAppend(&pBlkCache->ListDirtyNotCommitted, &pEntry->NodeNotCommitted); + RTSpinlockRelease(pBlkCache->LockList); + + uint32_t cbDirty = ASMAtomicAddU32(&pCache->cbDirty, pEntry->cbData); + + /* Prevent committing if the VM was suspended. */ + if (RT_LIKELY(!ASMAtomicReadBool(&pCache->fIoErrorVmSuspended))) + fDirtyBytesExceeded = (cbDirty + pEntry->cbData >= pCache->cbCommitDirtyThreshold); + else if (!cbDirty && pCache->u32CommitTimeoutMs > 0) + { + /* Arm the commit timer. */ + TMTimerSetMillies(pCache->pTimerCommit, pCache->u32CommitTimeoutMs); + } + } + + return fDirtyBytesExceeded; +} + +static PPDMBLKCACHE pdmR3BlkCacheFindById(PPDMBLKCACHEGLOBAL pBlkCacheGlobal, const char *pcszId) +{ + bool fFound = false; + + PPDMBLKCACHE pBlkCache; + RTListForEach(&pBlkCacheGlobal->ListUsers, pBlkCache, PDMBLKCACHE, NodeCacheUser) + { + if (!RTStrCmp(pBlkCache->pszId, pcszId)) + { + fFound = true; + break; + } + } + + return fFound ? pBlkCache : NULL; +} + +/** + * Commit timer callback. + */ +static DECLCALLBACK(void) pdmBlkCacheCommitTimerCallback(PVM pVM, PTMTIMER pTimer, void *pvUser) +{ + PPDMBLKCACHEGLOBAL pCache = (PPDMBLKCACHEGLOBAL)pvUser; + NOREF(pVM); NOREF(pTimer); + + LogFlowFunc(("Commit interval expired, commiting dirty entries\n")); + + if ( ASMAtomicReadU32(&pCache->cbDirty) > 0 + && !ASMAtomicReadBool(&pCache->fIoErrorVmSuspended)) + pdmBlkCacheCommitDirtyEntries(pCache); + + LogFlowFunc(("Entries committed, going to sleep\n")); +} + +static DECLCALLBACK(int) pdmR3BlkCacheSaveExec(PVM pVM, PSSMHANDLE pSSM) +{ + PPDMBLKCACHEGLOBAL pBlkCacheGlobal = pVM->pUVM->pdm.s.pBlkCacheGlobal; + + AssertPtr(pBlkCacheGlobal); + + pdmBlkCacheLockEnter(pBlkCacheGlobal); + + SSMR3PutU32(pSSM, pBlkCacheGlobal->cRefs); + + /* Go through the list and save all dirty entries. */ + PPDMBLKCACHE pBlkCache; + RTListForEach(&pBlkCacheGlobal->ListUsers, pBlkCache, PDMBLKCACHE, NodeCacheUser) + { + uint32_t cEntries = 0; + PPDMBLKCACHEENTRY pEntry; + + RTSemRWRequestRead(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + SSMR3PutU32(pSSM, (uint32_t)strlen(pBlkCache->pszId)); + SSMR3PutStrZ(pSSM, pBlkCache->pszId); + + /* Count the number of entries to safe. */ + RTListForEach(&pBlkCache->ListDirtyNotCommitted, pEntry, PDMBLKCACHEENTRY, NodeNotCommitted) + { + cEntries++; + } + + SSMR3PutU32(pSSM, cEntries); + + /* Walk the list of all dirty entries and save them. */ + RTListForEach(&pBlkCache->ListDirtyNotCommitted, pEntry, PDMBLKCACHEENTRY, NodeNotCommitted) + { + /* A few sanity checks. */ + AssertMsg(!pEntry->cRefs, ("The entry is still referenced\n")); + AssertMsg(pEntry->fFlags & PDMBLKCACHE_ENTRY_IS_DIRTY, ("Entry is not dirty\n")); + AssertMsg(!(pEntry->fFlags & ~PDMBLKCACHE_ENTRY_IS_DIRTY), ("Invalid flags set\n")); + AssertMsg(!pEntry->pWaitingHead && !pEntry->pWaitingTail, ("There are waiting requests\n")); + AssertMsg( pEntry->pList == &pBlkCacheGlobal->LruRecentlyUsedIn + || pEntry->pList == &pBlkCacheGlobal->LruFrequentlyUsed, + ("Invalid list\n")); + AssertMsg(pEntry->cbData == pEntry->Core.KeyLast - pEntry->Core.Key + 1, + ("Size and range do not match\n")); + + /* Save */ + SSMR3PutU64(pSSM, pEntry->Core.Key); + SSMR3PutU32(pSSM, pEntry->cbData); + SSMR3PutMem(pSSM, pEntry->pbData, pEntry->cbData); + } + + RTSemRWReleaseRead(pBlkCache->SemRWEntries); + } + + pdmBlkCacheLockLeave(pBlkCacheGlobal); + + /* Terminator */ + return SSMR3PutU32(pSSM, UINT32_MAX); +} + +static DECLCALLBACK(int) pdmR3BlkCacheLoadExec(PVM pVM, PSSMHANDLE pSSM, uint32_t uVersion, uint32_t uPass) +{ + PPDMBLKCACHEGLOBAL pBlkCacheGlobal = pVM->pUVM->pdm.s.pBlkCacheGlobal; + uint32_t cRefs; + + NOREF(uPass); + AssertPtr(pBlkCacheGlobal); + + pdmBlkCacheLockEnter(pBlkCacheGlobal); + + if (uVersion != PDM_BLK_CACHE_SAVED_STATE_VERSION) + return VERR_SSM_UNSUPPORTED_DATA_UNIT_VERSION; + + SSMR3GetU32(pSSM, &cRefs); + + /* + * Fewer users in the saved state than in the current VM are allowed + * because that means that there are only new ones which don't have any saved state + * which can get lost. + * More saved state entries than registered cache users are only allowed if the + * missing users don't have any data saved in the cache. + */ + int rc = VINF_SUCCESS; + char *pszId = NULL; + + while ( cRefs > 0 + && RT_SUCCESS(rc)) + { + PPDMBLKCACHE pBlkCache = NULL; + uint32_t cbId = 0; + + SSMR3GetU32(pSSM, &cbId); + Assert(cbId > 0); + + cbId++; /* Include terminator */ + pszId = (char *)RTMemAllocZ(cbId * sizeof(char)); + if (!pszId) + { + rc = VERR_NO_MEMORY; + break; + } + + rc = SSMR3GetStrZ(pSSM, pszId, cbId); + AssertRC(rc); + + /* Search for the block cache with the provided id. */ + pBlkCache = pdmR3BlkCacheFindById(pBlkCacheGlobal, pszId); + + /* Get the entries */ + uint32_t cEntries; + SSMR3GetU32(pSSM, &cEntries); + + if (!pBlkCache && (cEntries > 0)) + { + rc = SSMR3SetCfgError(pSSM, RT_SRC_POS, + N_("The VM is missing a block device and there is data in the cache. Please make sure the source and target VMs have compatible storage configurations")); + break; + } + + RTMemFree(pszId); + pszId = NULL; + + while (cEntries > 0) + { + PPDMBLKCACHEENTRY pEntry; + uint64_t off; + uint32_t cbEntry; + + SSMR3GetU64(pSSM, &off); + SSMR3GetU32(pSSM, &cbEntry); + + pEntry = pdmBlkCacheEntryAlloc(pBlkCache, off, cbEntry, NULL); + if (!pEntry) + { + rc = VERR_NO_MEMORY; + break; + } + + rc = SSMR3GetMem(pSSM, pEntry->pbData, cbEntry); + if (RT_FAILURE(rc)) + { + RTMemFree(pEntry->pbData); + RTMemFree(pEntry); + break; + } + + /* Insert into the tree. */ + bool fInserted = RTAvlrU64Insert(pBlkCache->pTree, &pEntry->Core); + Assert(fInserted); NOREF(fInserted); + + /* Add to the dirty list. */ + pdmBlkCacheAddDirtyEntry(pBlkCache, pEntry); + pdmBlkCacheEntryAddToList(&pBlkCacheGlobal->LruRecentlyUsedIn, pEntry); + pdmBlkCacheAdd(pBlkCacheGlobal, cbEntry); + pdmBlkCacheEntryRelease(pEntry); + cEntries--; + } + + cRefs--; + } + + if (pszId) + RTMemFree(pszId); + + if (cRefs && RT_SUCCESS(rc)) + rc = SSMR3SetCfgError(pSSM, RT_SRC_POS, + N_("Unexpected error while restoring state. Please make sure the source and target VMs have compatible storage configurations")); + + pdmBlkCacheLockLeave(pBlkCacheGlobal); + + if (RT_SUCCESS(rc)) + { + uint32_t u32 = 0; + rc = SSMR3GetU32(pSSM, &u32); + if (RT_SUCCESS(rc)) + AssertMsgReturn(u32 == UINT32_MAX, ("%#x\n", u32), VERR_SSM_DATA_UNIT_FORMAT_CHANGED); + } + + return rc; +} + +int pdmR3BlkCacheInit(PVM pVM) +{ + int rc = VINF_SUCCESS; + PUVM pUVM = pVM->pUVM; + PPDMBLKCACHEGLOBAL pBlkCacheGlobal; + + LogFlowFunc((": pVM=%p\n", pVM)); + + VM_ASSERT_EMT(pVM); + + PCFGMNODE pCfgRoot = CFGMR3GetRoot(pVM); + PCFGMNODE pCfgBlkCache = CFGMR3GetChild(CFGMR3GetChild(pCfgRoot, "PDM"), "BlkCache"); + + pBlkCacheGlobal = (PPDMBLKCACHEGLOBAL)RTMemAllocZ(sizeof(PDMBLKCACHEGLOBAL)); + if (!pBlkCacheGlobal) + return VERR_NO_MEMORY; + + RTListInit(&pBlkCacheGlobal->ListUsers); + pBlkCacheGlobal->pVM = pVM; + pBlkCacheGlobal->cRefs = 0; + pBlkCacheGlobal->cbCached = 0; + pBlkCacheGlobal->fCommitInProgress = false; + + /* Initialize members */ + pBlkCacheGlobal->LruRecentlyUsedIn.pHead = NULL; + pBlkCacheGlobal->LruRecentlyUsedIn.pTail = NULL; + pBlkCacheGlobal->LruRecentlyUsedIn.cbCached = 0; + + pBlkCacheGlobal->LruRecentlyUsedOut.pHead = NULL; + pBlkCacheGlobal->LruRecentlyUsedOut.pTail = NULL; + pBlkCacheGlobal->LruRecentlyUsedOut.cbCached = 0; + + pBlkCacheGlobal->LruFrequentlyUsed.pHead = NULL; + pBlkCacheGlobal->LruFrequentlyUsed.pTail = NULL; + pBlkCacheGlobal->LruFrequentlyUsed.cbCached = 0; + + do + { + rc = CFGMR3QueryU32Def(pCfgBlkCache, "CacheSize", &pBlkCacheGlobal->cbMax, 5 * _1M); + AssertLogRelRCBreak(rc); + LogFlowFunc(("Maximum number of bytes cached %u\n", pBlkCacheGlobal->cbMax)); + + pBlkCacheGlobal->cbRecentlyUsedInMax = (pBlkCacheGlobal->cbMax / 100) * 25; /* 25% of the buffer size */ + pBlkCacheGlobal->cbRecentlyUsedOutMax = (pBlkCacheGlobal->cbMax / 100) * 50; /* 50% of the buffer size */ + LogFlowFunc(("cbRecentlyUsedInMax=%u cbRecentlyUsedOutMax=%u\n", + pBlkCacheGlobal->cbRecentlyUsedInMax, pBlkCacheGlobal->cbRecentlyUsedOutMax)); + + /** @todo r=aeichner: Experiment to find optimal default values */ + rc = CFGMR3QueryU32Def(pCfgBlkCache, "CacheCommitIntervalMs", &pBlkCacheGlobal->u32CommitTimeoutMs, 10000 /* 10sec */); + AssertLogRelRCBreak(rc); + rc = CFGMR3QueryU32Def(pCfgBlkCache, "CacheCommitThreshold", &pBlkCacheGlobal->cbCommitDirtyThreshold, pBlkCacheGlobal->cbMax / 2); + AssertLogRelRCBreak(rc); + } while (0); + + if (RT_SUCCESS(rc)) + { + STAMR3Register(pVM, &pBlkCacheGlobal->cbMax, + STAMTYPE_U32, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/cbMax", + STAMUNIT_BYTES, + "Maximum cache size"); + STAMR3Register(pVM, &pBlkCacheGlobal->cbCached, + STAMTYPE_U32, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/cbCached", + STAMUNIT_BYTES, + "Currently used cache"); + STAMR3Register(pVM, &pBlkCacheGlobal->LruRecentlyUsedIn.cbCached, + STAMTYPE_U32, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/cbCachedMruIn", + STAMUNIT_BYTES, + "Number of bytes cached in MRU list"); + STAMR3Register(pVM, &pBlkCacheGlobal->LruRecentlyUsedOut.cbCached, + STAMTYPE_U32, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/cbCachedMruOut", + STAMUNIT_BYTES, + "Number of bytes cached in FRU list"); + STAMR3Register(pVM, &pBlkCacheGlobal->LruFrequentlyUsed.cbCached, + STAMTYPE_U32, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/cbCachedFru", + STAMUNIT_BYTES, + "Number of bytes cached in FRU ghost list"); + +#ifdef VBOX_WITH_STATISTICS + STAMR3Register(pVM, &pBlkCacheGlobal->cHits, + STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/CacheHits", + STAMUNIT_COUNT, "Number of hits in the cache"); + STAMR3Register(pVM, &pBlkCacheGlobal->cPartialHits, + STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/CachePartialHits", + STAMUNIT_COUNT, "Number of partial hits in the cache"); + STAMR3Register(pVM, &pBlkCacheGlobal->cMisses, + STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/CacheMisses", + STAMUNIT_COUNT, "Number of misses when accessing the cache"); + STAMR3Register(pVM, &pBlkCacheGlobal->StatRead, + STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/CacheRead", + STAMUNIT_BYTES, "Number of bytes read from the cache"); + STAMR3Register(pVM, &pBlkCacheGlobal->StatWritten, + STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/CacheWritten", + STAMUNIT_BYTES, "Number of bytes written to the cache"); + STAMR3Register(pVM, &pBlkCacheGlobal->StatTreeGet, + STAMTYPE_PROFILE_ADV, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/CacheTreeGet", + STAMUNIT_TICKS_PER_CALL, "Time taken to access an entry in the tree"); + STAMR3Register(pVM, &pBlkCacheGlobal->StatTreeInsert, + STAMTYPE_PROFILE_ADV, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/CacheTreeInsert", + STAMUNIT_TICKS_PER_CALL, "Time taken to insert an entry in the tree"); + STAMR3Register(pVM, &pBlkCacheGlobal->StatTreeRemove, + STAMTYPE_PROFILE_ADV, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/CacheTreeRemove", + STAMUNIT_TICKS_PER_CALL, "Time taken to remove an entry an the tree"); + STAMR3Register(pVM, &pBlkCacheGlobal->StatBuffersReused, + STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, + "/PDM/BlkCache/CacheBuffersReused", + STAMUNIT_COUNT, "Number of times a buffer could be reused"); +#endif + + /* Initialize the critical section */ + rc = RTCritSectInit(&pBlkCacheGlobal->CritSect); + } + + if (RT_SUCCESS(rc)) + { + /* Create the commit timer */ + if (pBlkCacheGlobal->u32CommitTimeoutMs > 0) + rc = TMR3TimerCreateInternal(pVM, TMCLOCK_REAL, + pdmBlkCacheCommitTimerCallback, + pBlkCacheGlobal, + "BlkCache-Commit", + &pBlkCacheGlobal->pTimerCommit); + + if (RT_SUCCESS(rc)) + { + /* Register saved state handler. */ + rc = SSMR3RegisterInternal(pVM, "pdmblkcache", 0, PDM_BLK_CACHE_SAVED_STATE_VERSION, pBlkCacheGlobal->cbMax, + NULL, NULL, NULL, + NULL, pdmR3BlkCacheSaveExec, NULL, + NULL, pdmR3BlkCacheLoadExec, NULL); + if (RT_SUCCESS(rc)) + { + LogRel(("BlkCache: Cache successfully initialized. Cache size is %u bytes\n", pBlkCacheGlobal->cbMax)); + LogRel(("BlkCache: Cache commit interval is %u ms\n", pBlkCacheGlobal->u32CommitTimeoutMs)); + LogRel(("BlkCache: Cache commit threshold is %u bytes\n", pBlkCacheGlobal->cbCommitDirtyThreshold)); + pUVM->pdm.s.pBlkCacheGlobal = pBlkCacheGlobal; + return VINF_SUCCESS; + } + } + + RTCritSectDelete(&pBlkCacheGlobal->CritSect); + } + + if (pBlkCacheGlobal) + RTMemFree(pBlkCacheGlobal); + + LogFlowFunc((": returns rc=%Rrc\n", rc)); + return rc; +} + +void pdmR3BlkCacheTerm(PVM pVM) +{ + PPDMBLKCACHEGLOBAL pBlkCacheGlobal = pVM->pUVM->pdm.s.pBlkCacheGlobal; + + if (pBlkCacheGlobal) + { + /* Make sure no one else uses the cache now */ + pdmBlkCacheLockEnter(pBlkCacheGlobal); + + /* Cleanup deleting all cache entries waiting for in progress entries to finish. */ + pdmBlkCacheDestroyList(&pBlkCacheGlobal->LruRecentlyUsedIn); + pdmBlkCacheDestroyList(&pBlkCacheGlobal->LruRecentlyUsedOut); + pdmBlkCacheDestroyList(&pBlkCacheGlobal->LruFrequentlyUsed); + + pdmBlkCacheLockLeave(pBlkCacheGlobal); + + RTCritSectDelete(&pBlkCacheGlobal->CritSect); + RTMemFree(pBlkCacheGlobal); + pVM->pUVM->pdm.s.pBlkCacheGlobal = NULL; + } +} + +int pdmR3BlkCacheResume(PVM pVM) +{ + PPDMBLKCACHEGLOBAL pBlkCacheGlobal = pVM->pUVM->pdm.s.pBlkCacheGlobal; + + LogFlowFunc(("pVM=%#p\n", pVM)); + + if ( pBlkCacheGlobal + && ASMAtomicXchgBool(&pBlkCacheGlobal->fIoErrorVmSuspended, false)) + { + /* The VM was suspended because of an I/O error, commit all dirty entries. */ + pdmBlkCacheCommitDirtyEntries(pBlkCacheGlobal); + } + + return VINF_SUCCESS; +} + +static int pdmR3BlkCacheRetain(PVM pVM, PPPDMBLKCACHE ppBlkCache, const char *pcszId) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHE pBlkCache = NULL; + PPDMBLKCACHEGLOBAL pBlkCacheGlobal = pVM->pUVM->pdm.s.pBlkCacheGlobal; + + if (!pBlkCacheGlobal) + return VERR_NOT_SUPPORTED; + + /* + * Check that no other user cache has the same id first, + * Unique id's are necessary in case the state is saved. + */ + pdmBlkCacheLockEnter(pBlkCacheGlobal); + + pBlkCache = pdmR3BlkCacheFindById(pBlkCacheGlobal, pcszId); + + if (!pBlkCache) + { + pBlkCache = (PPDMBLKCACHE)RTMemAllocZ(sizeof(PDMBLKCACHE)); + + if (pBlkCache) + pBlkCache->pszId = RTStrDup(pcszId); + + if ( pBlkCache + && pBlkCache->pszId) + { + pBlkCache->fSuspended = false; + pBlkCache->cIoXfersActive = 0; + pBlkCache->pCache = pBlkCacheGlobal; + RTListInit(&pBlkCache->ListDirtyNotCommitted); + + rc = RTSpinlockCreate(&pBlkCache->LockList, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "pdmR3BlkCacheRetain"); + if (RT_SUCCESS(rc)) + { + rc = RTSemRWCreate(&pBlkCache->SemRWEntries); + if (RT_SUCCESS(rc)) + { + pBlkCache->pTree = (PAVLRU64TREE)RTMemAllocZ(sizeof(AVLRFOFFTREE)); + if (pBlkCache->pTree) + { +#ifdef VBOX_WITH_STATISTICS + STAMR3RegisterF(pBlkCacheGlobal->pVM, &pBlkCache->StatWriteDeferred, + STAMTYPE_COUNTER, STAMVISIBILITY_ALWAYS, + STAMUNIT_COUNT, "Number of deferred writes", + "/PDM/BlkCache/%s/Cache/DeferredWrites", pBlkCache->pszId); +#endif + + /* Add to the list of users. */ + pBlkCacheGlobal->cRefs++; + RTListAppend(&pBlkCacheGlobal->ListUsers, &pBlkCache->NodeCacheUser); + pdmBlkCacheLockLeave(pBlkCacheGlobal); + + *ppBlkCache = pBlkCache; + LogFlowFunc(("returns success\n")); + return VINF_SUCCESS; + } + + rc = VERR_NO_MEMORY; + RTSemRWDestroy(pBlkCache->SemRWEntries); + } + + RTSpinlockDestroy(pBlkCache->LockList); + } + + RTStrFree(pBlkCache->pszId); + } + else + rc = VERR_NO_MEMORY; + + if (pBlkCache) + RTMemFree(pBlkCache); + } + else + rc = VERR_ALREADY_EXISTS; + + pdmBlkCacheLockLeave(pBlkCacheGlobal); + + LogFlowFunc(("Leave rc=%Rrc\n", rc)); + return rc; +} + +VMMR3DECL(int) PDMR3BlkCacheRetainDriver(PVM pVM, PPDMDRVINS pDrvIns, PPPDMBLKCACHE ppBlkCache, + PFNPDMBLKCACHEXFERCOMPLETEDRV pfnXferComplete, + PFNPDMBLKCACHEXFERENQUEUEDRV pfnXferEnqueue, + PFNPDMBLKCACHEXFERENQUEUEDISCARDDRV pfnXferEnqueueDiscard, + const char *pcszId) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHE pBlkCache; + + rc = pdmR3BlkCacheRetain(pVM, &pBlkCache, pcszId); + if (RT_SUCCESS(rc)) + { + pBlkCache->enmType = PDMBLKCACHETYPE_DRV; + pBlkCache->u.Drv.pfnXferComplete = pfnXferComplete; + pBlkCache->u.Drv.pfnXferEnqueue = pfnXferEnqueue; + pBlkCache->u.Drv.pfnXferEnqueueDiscard = pfnXferEnqueueDiscard; + pBlkCache->u.Drv.pDrvIns = pDrvIns; + *ppBlkCache = pBlkCache; + } + + LogFlowFunc(("Leave rc=%Rrc\n", rc)); + return rc; +} + +VMMR3DECL(int) PDMR3BlkCacheRetainDevice(PVM pVM, PPDMDEVINS pDevIns, PPPDMBLKCACHE ppBlkCache, + PFNPDMBLKCACHEXFERCOMPLETEDEV pfnXferComplete, + PFNPDMBLKCACHEXFERENQUEUEDEV pfnXferEnqueue, + PFNPDMBLKCACHEXFERENQUEUEDISCARDDEV pfnXferEnqueueDiscard, + const char *pcszId) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHE pBlkCache; + + rc = pdmR3BlkCacheRetain(pVM, &pBlkCache, pcszId); + if (RT_SUCCESS(rc)) + { + pBlkCache->enmType = PDMBLKCACHETYPE_DEV; + pBlkCache->u.Dev.pfnXferComplete = pfnXferComplete; + pBlkCache->u.Dev.pfnXferEnqueue = pfnXferEnqueue; + pBlkCache->u.Dev.pfnXferEnqueueDiscard = pfnXferEnqueueDiscard; + pBlkCache->u.Dev.pDevIns = pDevIns; + *ppBlkCache = pBlkCache; + } + + LogFlowFunc(("Leave rc=%Rrc\n", rc)); + return rc; + +} + +VMMR3DECL(int) PDMR3BlkCacheRetainUsb(PVM pVM, PPDMUSBINS pUsbIns, PPPDMBLKCACHE ppBlkCache, + PFNPDMBLKCACHEXFERCOMPLETEUSB pfnXferComplete, + PFNPDMBLKCACHEXFERENQUEUEUSB pfnXferEnqueue, + PFNPDMBLKCACHEXFERENQUEUEDISCARDUSB pfnXferEnqueueDiscard, + const char *pcszId) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHE pBlkCache; + + rc = pdmR3BlkCacheRetain(pVM, &pBlkCache, pcszId); + if (RT_SUCCESS(rc)) + { + pBlkCache->enmType = PDMBLKCACHETYPE_USB; + pBlkCache->u.Usb.pfnXferComplete = pfnXferComplete; + pBlkCache->u.Usb.pfnXferEnqueue = pfnXferEnqueue; + pBlkCache->u.Usb.pfnXferEnqueueDiscard = pfnXferEnqueueDiscard; + pBlkCache->u.Usb.pUsbIns = pUsbIns; + *ppBlkCache = pBlkCache; + } + + LogFlowFunc(("Leave rc=%Rrc\n", rc)); + return rc; + +} + +VMMR3DECL(int) PDMR3BlkCacheRetainInt(PVM pVM, void *pvUser, PPPDMBLKCACHE ppBlkCache, + PFNPDMBLKCACHEXFERCOMPLETEINT pfnXferComplete, + PFNPDMBLKCACHEXFERENQUEUEINT pfnXferEnqueue, + PFNPDMBLKCACHEXFERENQUEUEDISCARDINT pfnXferEnqueueDiscard, + const char *pcszId) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHE pBlkCache; + + rc = pdmR3BlkCacheRetain(pVM, &pBlkCache, pcszId); + if (RT_SUCCESS(rc)) + { + pBlkCache->enmType = PDMBLKCACHETYPE_INTERNAL; + pBlkCache->u.Int.pfnXferComplete = pfnXferComplete; + pBlkCache->u.Int.pfnXferEnqueue = pfnXferEnqueue; + pBlkCache->u.Int.pfnXferEnqueueDiscard = pfnXferEnqueueDiscard; + pBlkCache->u.Int.pvUser = pvUser; + *ppBlkCache = pBlkCache; + } + + LogFlowFunc(("Leave rc=%Rrc\n", rc)); + return rc; + +} + +/** + * Callback for the AVL destroy routine. Frees a cache entry for this endpoint. + * + * @returns IPRT status code. + * @param pNode The node to destroy. + * @param pvUser Opaque user data. + */ +static DECLCALLBACK(int) pdmBlkCacheEntryDestroy(PAVLRU64NODECORE pNode, void *pvUser) +{ + PPDMBLKCACHEENTRY pEntry = (PPDMBLKCACHEENTRY)pNode; + PPDMBLKCACHEGLOBAL pCache = (PPDMBLKCACHEGLOBAL)pvUser; + PPDMBLKCACHE pBlkCache = pEntry->pBlkCache; + + while (ASMAtomicReadU32(&pEntry->fFlags) & PDMBLKCACHE_ENTRY_IO_IN_PROGRESS) + { + /* Leave the locks to let the I/O thread make progress but reference the entry to prevent eviction. */ + pdmBlkCacheEntryRef(pEntry); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + pdmBlkCacheLockLeave(pCache); + + RTThreadSleep(250); + + /* Re-enter all locks */ + pdmBlkCacheLockEnter(pCache); + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + pdmBlkCacheEntryRelease(pEntry); + } + + AssertMsg(!(pEntry->fFlags & PDMBLKCACHE_ENTRY_IO_IN_PROGRESS), + ("Entry is dirty and/or still in progress fFlags=%#x\n", pEntry->fFlags)); + + bool fUpdateCache = pEntry->pList == &pCache->LruFrequentlyUsed + || pEntry->pList == &pCache->LruRecentlyUsedIn; + + pdmBlkCacheEntryRemoveFromList(pEntry); + + if (fUpdateCache) + pdmBlkCacheSub(pCache, pEntry->cbData); + + RTMemPageFree(pEntry->pbData, pEntry->cbData); + RTMemFree(pEntry); + + return VINF_SUCCESS; +} + +/** + * Destroys all cache resources used by the given endpoint. + * + * @returns nothing. + * @param pBlkCache Block cache handle. + */ +VMMR3DECL(void) PDMR3BlkCacheRelease(PPDMBLKCACHE pBlkCache) +{ + PPDMBLKCACHEGLOBAL pCache = pBlkCache->pCache; + + /* + * Commit all dirty entries now (they are waited on for completion during the + * destruction of the AVL tree below). + * The exception is if the VM was paused because of an I/O error before. + */ + if (!ASMAtomicReadBool(&pCache->fIoErrorVmSuspended)) + pdmBlkCacheCommit(pBlkCache); + + /* Make sure nobody is accessing the cache while we delete the tree. */ + pdmBlkCacheLockEnter(pCache); + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + RTAvlrU64Destroy(pBlkCache->pTree, pdmBlkCacheEntryDestroy, pCache); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + RTSpinlockDestroy(pBlkCache->LockList); + + pCache->cRefs--; + RTListNodeRemove(&pBlkCache->NodeCacheUser); + + pdmBlkCacheLockLeave(pCache); + + RTMemFree(pBlkCache->pTree); + pBlkCache->pTree = NULL; + RTSemRWDestroy(pBlkCache->SemRWEntries); + +#ifdef VBOX_WITH_STATISTICS + STAMR3DeregisterF(pCache->pVM->pUVM, "/PDM/BlkCache/%s/Cache/DeferredWrites", pBlkCache->pszId); +#endif + + RTStrFree(pBlkCache->pszId); + RTMemFree(pBlkCache); +} + +VMMR3DECL(void) PDMR3BlkCacheReleaseDevice(PVM pVM, PPDMDEVINS pDevIns) +{ + LogFlow(("%s: pDevIns=%p\n", __FUNCTION__, pDevIns)); + + /* + * Validate input. + */ + if (!pDevIns) + return; + VM_ASSERT_EMT(pVM); + + PPDMBLKCACHEGLOBAL pBlkCacheGlobal = pVM->pUVM->pdm.s.pBlkCacheGlobal; + PPDMBLKCACHE pBlkCache, pBlkCacheNext; + + /* Return silently if not supported. */ + if (!pBlkCacheGlobal) + return; + + pdmBlkCacheLockEnter(pBlkCacheGlobal); + + RTListForEachSafe(&pBlkCacheGlobal->ListUsers, pBlkCache, pBlkCacheNext, PDMBLKCACHE, NodeCacheUser) + { + if ( pBlkCache->enmType == PDMBLKCACHETYPE_DEV + && pBlkCache->u.Dev.pDevIns == pDevIns) + PDMR3BlkCacheRelease(pBlkCache); + } + + pdmBlkCacheLockLeave(pBlkCacheGlobal); +} + +VMMR3DECL(void) PDMR3BlkCacheReleaseDriver(PVM pVM, PPDMDRVINS pDrvIns) +{ + LogFlow(("%s: pDrvIns=%p\n", __FUNCTION__, pDrvIns)); + + /* + * Validate input. + */ + if (!pDrvIns) + return; + VM_ASSERT_EMT(pVM); + + PPDMBLKCACHEGLOBAL pBlkCacheGlobal = pVM->pUVM->pdm.s.pBlkCacheGlobal; + PPDMBLKCACHE pBlkCache, pBlkCacheNext; + + /* Return silently if not supported. */ + if (!pBlkCacheGlobal) + return; + + pdmBlkCacheLockEnter(pBlkCacheGlobal); + + RTListForEachSafe(&pBlkCacheGlobal->ListUsers, pBlkCache, pBlkCacheNext, PDMBLKCACHE, NodeCacheUser) + { + if ( pBlkCache->enmType == PDMBLKCACHETYPE_DRV + && pBlkCache->u.Drv.pDrvIns == pDrvIns) + PDMR3BlkCacheRelease(pBlkCache); + } + + pdmBlkCacheLockLeave(pBlkCacheGlobal); +} + +VMMR3DECL(void) PDMR3BlkCacheReleaseUsb(PVM pVM, PPDMUSBINS pUsbIns) +{ + LogFlow(("%s: pUsbIns=%p\n", __FUNCTION__, pUsbIns)); + + /* + * Validate input. + */ + if (!pUsbIns) + return; + VM_ASSERT_EMT(pVM); + + PPDMBLKCACHEGLOBAL pBlkCacheGlobal = pVM->pUVM->pdm.s.pBlkCacheGlobal; + PPDMBLKCACHE pBlkCache, pBlkCacheNext; + + /* Return silently if not supported. */ + if (!pBlkCacheGlobal) + return; + + pdmBlkCacheLockEnter(pBlkCacheGlobal); + + RTListForEachSafe(&pBlkCacheGlobal->ListUsers, pBlkCache, pBlkCacheNext, PDMBLKCACHE, NodeCacheUser) + { + if ( pBlkCache->enmType == PDMBLKCACHETYPE_USB + && pBlkCache->u.Usb.pUsbIns == pUsbIns) + PDMR3BlkCacheRelease(pBlkCache); + } + + pdmBlkCacheLockLeave(pBlkCacheGlobal); +} + +static PPDMBLKCACHEENTRY pdmBlkCacheGetCacheEntryByOffset(PPDMBLKCACHE pBlkCache, uint64_t off) +{ + STAM_PROFILE_ADV_START(&pBlkCache->pCache->StatTreeGet, Cache); + + RTSemRWRequestRead(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + PPDMBLKCACHEENTRY pEntry = (PPDMBLKCACHEENTRY)RTAvlrU64RangeGet(pBlkCache->pTree, off); + if (pEntry) + pdmBlkCacheEntryRef(pEntry); + RTSemRWReleaseRead(pBlkCache->SemRWEntries); + + STAM_PROFILE_ADV_STOP(&pBlkCache->pCache->StatTreeGet, Cache); + + return pEntry; +} + +/** + * Return the best fit cache entries for the given offset. + * + * @returns nothing. + * @param pBlkCache The endpoint cache. + * @param off The offset. + * @param ppEntryAbove Where to store the pointer to the best fit entry above + * the given offset. NULL if not required. + */ +static void pdmBlkCacheGetCacheBestFitEntryByOffset(PPDMBLKCACHE pBlkCache, uint64_t off, PPDMBLKCACHEENTRY *ppEntryAbove) +{ + STAM_PROFILE_ADV_START(&pBlkCache->pCache->StatTreeGet, Cache); + + RTSemRWRequestRead(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + if (ppEntryAbove) + { + *ppEntryAbove = (PPDMBLKCACHEENTRY)RTAvlrU64GetBestFit(pBlkCache->pTree, off, true /*fAbove*/); + if (*ppEntryAbove) + pdmBlkCacheEntryRef(*ppEntryAbove); + } + + RTSemRWReleaseRead(pBlkCache->SemRWEntries); + + STAM_PROFILE_ADV_STOP(&pBlkCache->pCache->StatTreeGet, Cache); +} + +static void pdmBlkCacheInsertEntry(PPDMBLKCACHE pBlkCache, PPDMBLKCACHEENTRY pEntry) +{ + STAM_PROFILE_ADV_START(&pBlkCache->pCache->StatTreeInsert, Cache); + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + bool fInserted = RTAvlrU64Insert(pBlkCache->pTree, &pEntry->Core); + AssertMsg(fInserted, ("Node was not inserted into tree\n")); NOREF(fInserted); + STAM_PROFILE_ADV_STOP(&pBlkCache->pCache->StatTreeInsert, Cache); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); +} + +/** + * Allocates and initializes a new entry for the cache. + * The entry has a reference count of 1. + * + * @returns Pointer to the new cache entry or NULL if out of memory. + * @param pBlkCache The cache the entry belongs to. + * @param off Start offset. + * @param cbData Size of the cache entry. + * @param pbBuffer Pointer to the buffer to use. + * NULL if a new buffer should be allocated. + * The buffer needs to have the same size of the entry. + */ +static PPDMBLKCACHEENTRY pdmBlkCacheEntryAlloc(PPDMBLKCACHE pBlkCache, uint64_t off, size_t cbData, uint8_t *pbBuffer) +{ + AssertReturn(cbData <= UINT32_MAX, NULL); + PPDMBLKCACHEENTRY pEntryNew = (PPDMBLKCACHEENTRY)RTMemAllocZ(sizeof(PDMBLKCACHEENTRY)); + + if (RT_UNLIKELY(!pEntryNew)) + return NULL; + + pEntryNew->Core.Key = off; + pEntryNew->Core.KeyLast = off + cbData - 1; + pEntryNew->pBlkCache = pBlkCache; + pEntryNew->fFlags = 0; + pEntryNew->cRefs = 1; /* We are using it now. */ + pEntryNew->pList = NULL; + pEntryNew->cbData = (uint32_t)cbData; + pEntryNew->pWaitingHead = NULL; + pEntryNew->pWaitingTail = NULL; + if (pbBuffer) + pEntryNew->pbData = pbBuffer; + else + pEntryNew->pbData = (uint8_t *)RTMemPageAlloc(cbData); + + if (RT_UNLIKELY(!pEntryNew->pbData)) + { + RTMemFree(pEntryNew); + return NULL; + } + + return pEntryNew; +} + +/** + * Checks that a set of flags is set/clear acquiring the R/W semaphore + * in exclusive mode. + * + * @returns true if the flag in fSet is set and the one in fClear is clear. + * false otherwise. + * The R/W semaphore is only held if true is returned. + * + * @param pBlkCache The endpoint cache instance data. + * @param pEntry The entry to check the flags for. + * @param fSet The flag which is tested to be set. + * @param fClear The flag which is tested to be clear. + */ +DECLINLINE(bool) pdmBlkCacheEntryFlagIsSetClearAcquireLock(PPDMBLKCACHE pBlkCache, + PPDMBLKCACHEENTRY pEntry, + uint32_t fSet, uint32_t fClear) +{ + uint32_t fFlags = ASMAtomicReadU32(&pEntry->fFlags); + bool fPassed = ((fFlags & fSet) && !(fFlags & fClear)); + + if (fPassed) + { + /* Acquire the lock and check again because the completion callback might have raced us. */ + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + + fFlags = ASMAtomicReadU32(&pEntry->fFlags); + fPassed = ((fFlags & fSet) && !(fFlags & fClear)); + + /* Drop the lock if we didn't passed the test. */ + if (!fPassed) + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + } + + return fPassed; +} + +/** + * Adds a segment to the waiting list for a cache entry + * which is currently in progress. + * + * @returns nothing. + * @param pEntry The cache entry to add the segment to. + * @param pWaiter The waiter entry to add. + */ +DECLINLINE(void) pdmBlkCacheEntryAddWaiter(PPDMBLKCACHEENTRY pEntry, + PPDMBLKCACHEWAITER pWaiter) +{ + pWaiter->pNext = NULL; + + if (pEntry->pWaitingHead) + { + AssertPtr(pEntry->pWaitingTail); + + pEntry->pWaitingTail->pNext = pWaiter; + pEntry->pWaitingTail = pWaiter; + } + else + { + Assert(!pEntry->pWaitingTail); + + pEntry->pWaitingHead = pWaiter; + pEntry->pWaitingTail = pWaiter; + } +} + +/** + * Add a buffer described by the I/O memory context + * to the entry waiting for completion. + * + * @returns VBox status code. + * @param pEntry The entry to add the buffer to. + * @param pReq The request. + * @param pSgBuf The scatter/gather buffer. Will be advanced by cbData. + * @param offDiff Offset from the start of the buffer in the entry. + * @param cbData Amount of data to wait for onthis entry. + * @param fWrite Flag whether the task waits because it wants to write to + * the cache entry. + */ +static int pdmBlkCacheEntryWaitersAdd(PPDMBLKCACHEENTRY pEntry, PPDMBLKCACHEREQ pReq, + PRTSGBUF pSgBuf, uint64_t offDiff, size_t cbData, bool fWrite) +{ + PPDMBLKCACHEWAITER pWaiter = (PPDMBLKCACHEWAITER)RTMemAllocZ(sizeof(PDMBLKCACHEWAITER)); + if (!pWaiter) + return VERR_NO_MEMORY; + + ASMAtomicIncU32(&pReq->cXfersPending); + pWaiter->pReq = pReq; + pWaiter->offCacheEntry = offDiff; + pWaiter->cbTransfer = cbData; + pWaiter->fWrite = fWrite; + RTSgBufClone(&pWaiter->SgBuf, pSgBuf); + RTSgBufAdvance(pSgBuf, cbData); + + pdmBlkCacheEntryAddWaiter(pEntry, pWaiter); + + return VINF_SUCCESS; +} + +/** + * Calculate aligned offset and size for a new cache entry which do not + * intersect with an already existing entry and the file end. + * + * @returns The number of bytes the entry can hold of the requested amount + * of bytes. + * @param pBlkCache The endpoint cache. + * @param off The start offset. + * @param cb The number of bytes the entry needs to hold at + * least. + * @param pcbEntry Where to store the number of bytes the entry can hold. + * Can be less than given because of other entries. + */ +static uint32_t pdmBlkCacheEntryBoundariesCalc(PPDMBLKCACHE pBlkCache, + uint64_t off, uint32_t cb, + uint32_t *pcbEntry) +{ + /* Get the best fit entries around the offset */ + PPDMBLKCACHEENTRY pEntryAbove = NULL; + pdmBlkCacheGetCacheBestFitEntryByOffset(pBlkCache, off, &pEntryAbove); + + /* Log the info */ + LogFlow(("%sest fit entry above off=%llu (BestFit=%llu BestFitEnd=%llu BestFitSize=%u)\n", + pEntryAbove ? "B" : "No b", + off, + pEntryAbove ? pEntryAbove->Core.Key : 0, + pEntryAbove ? pEntryAbove->Core.KeyLast : 0, + pEntryAbove ? pEntryAbove->cbData : 0)); + + uint32_t cbNext; + uint32_t cbInEntry; + if ( pEntryAbove + && off + cb > pEntryAbove->Core.Key) + { + cbInEntry = (uint32_t)(pEntryAbove->Core.Key - off); + cbNext = (uint32_t)(pEntryAbove->Core.Key - off); + } + else + { + cbInEntry = cb; + cbNext = cb; + } + + /* A few sanity checks */ + AssertMsg(!pEntryAbove || off + cbNext <= pEntryAbove->Core.Key, + ("Aligned size intersects with another cache entry\n")); + Assert(cbInEntry <= cbNext); + + if (pEntryAbove) + pdmBlkCacheEntryRelease(pEntryAbove); + + LogFlow(("off=%llu cbNext=%u\n", off, cbNext)); + + *pcbEntry = cbNext; + + return cbInEntry; +} + +/** + * Create a new cache entry evicting data from the cache if required. + * + * @returns Pointer to the new cache entry or NULL + * if not enough bytes could be evicted from the cache. + * @param pBlkCache The endpoint cache. + * @param off The offset. + * @param cb Number of bytes the cache entry should have. + * @param pcbData Where to store the number of bytes the new + * entry can hold. May be lower than actually + * requested due to another entry intersecting the + * access range. + */ +static PPDMBLKCACHEENTRY pdmBlkCacheEntryCreate(PPDMBLKCACHE pBlkCache, uint64_t off, size_t cb, size_t *pcbData) +{ + uint32_t cbEntry = 0; + + *pcbData = pdmBlkCacheEntryBoundariesCalc(pBlkCache, off, (uint32_t)cb, &cbEntry); + AssertReturn(cb <= UINT32_MAX, NULL); + + PPDMBLKCACHEGLOBAL pCache = pBlkCache->pCache; + pdmBlkCacheLockEnter(pCache); + + PPDMBLKCACHEENTRY pEntryNew = NULL; + uint8_t *pbBuffer = NULL; + bool fEnough = pdmBlkCacheReclaim(pCache, cbEntry, true, &pbBuffer); + if (fEnough) + { + LogFlow(("Evicted enough bytes (%u requested). Creating new cache entry\n", cbEntry)); + + pEntryNew = pdmBlkCacheEntryAlloc(pBlkCache, off, cbEntry, pbBuffer); + if (RT_LIKELY(pEntryNew)) + { + pdmBlkCacheEntryAddToList(&pCache->LruRecentlyUsedIn, pEntryNew); + pdmBlkCacheAdd(pCache, cbEntry); + pdmBlkCacheLockLeave(pCache); + + pdmBlkCacheInsertEntry(pBlkCache, pEntryNew); + + AssertMsg( (off >= pEntryNew->Core.Key) + && (off + *pcbData <= pEntryNew->Core.KeyLast + 1), + ("Overflow in calculation off=%llu\n", off)); + } + else + pdmBlkCacheLockLeave(pCache); + } + else + pdmBlkCacheLockLeave(pCache); + + return pEntryNew; +} + +static PPDMBLKCACHEREQ pdmBlkCacheReqAlloc(void *pvUser) +{ + PPDMBLKCACHEREQ pReq = (PPDMBLKCACHEREQ)RTMemAlloc(sizeof(PDMBLKCACHEREQ)); + + if (RT_LIKELY(pReq)) + { + pReq->pvUser = pvUser; + pReq->rcReq = VINF_SUCCESS; + pReq->cXfersPending = 0; + } + + return pReq; +} + +static void pdmBlkCacheReqComplete(PPDMBLKCACHE pBlkCache, PPDMBLKCACHEREQ pReq) +{ + switch (pBlkCache->enmType) + { + case PDMBLKCACHETYPE_DEV: + { + pBlkCache->u.Dev.pfnXferComplete(pBlkCache->u.Dev.pDevIns, + pReq->pvUser, pReq->rcReq); + break; + } + case PDMBLKCACHETYPE_DRV: + { + pBlkCache->u.Drv.pfnXferComplete(pBlkCache->u.Drv.pDrvIns, + pReq->pvUser, pReq->rcReq); + break; + } + case PDMBLKCACHETYPE_USB: + { + pBlkCache->u.Usb.pfnXferComplete(pBlkCache->u.Usb.pUsbIns, + pReq->pvUser, pReq->rcReq); + break; + } + case PDMBLKCACHETYPE_INTERNAL: + { + pBlkCache->u.Int.pfnXferComplete(pBlkCache->u.Int.pvUser, + pReq->pvUser, pReq->rcReq); + break; + } + default: + AssertMsgFailed(("Unknown block cache type!\n")); + } + + RTMemFree(pReq); +} + +static bool pdmBlkCacheReqUpdate(PPDMBLKCACHE pBlkCache, PPDMBLKCACHEREQ pReq, + int rcReq, bool fCallHandler) +{ + if (RT_FAILURE(rcReq)) + ASMAtomicCmpXchgS32(&pReq->rcReq, rcReq, VINF_SUCCESS); + + AssertMsg(pReq->cXfersPending > 0, ("No transfers are pending for this request\n")); + uint32_t cXfersPending = ASMAtomicDecU32(&pReq->cXfersPending); + + if (!cXfersPending) + { + if (fCallHandler) + pdmBlkCacheReqComplete(pBlkCache, pReq); + return true; + } + + LogFlowFunc(("pReq=%#p cXfersPending=%u\n", pReq, cXfersPending)); + return false; +} + +VMMR3DECL(int) PDMR3BlkCacheRead(PPDMBLKCACHE pBlkCache, uint64_t off, + PCRTSGBUF pSgBuf, size_t cbRead, void *pvUser) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHEGLOBAL pCache = pBlkCache->pCache; + PPDMBLKCACHEENTRY pEntry; + PPDMBLKCACHEREQ pReq; + + LogFlowFunc((": pBlkCache=%#p{%s} off=%llu pSgBuf=%#p cbRead=%u pvUser=%#p\n", + pBlkCache, pBlkCache->pszId, off, pSgBuf, cbRead, pvUser)); + + AssertPtrReturn(pBlkCache, VERR_INVALID_POINTER); + AssertReturn(!pBlkCache->fSuspended, VERR_INVALID_STATE); + + RTSGBUF SgBuf; + RTSgBufClone(&SgBuf, pSgBuf); + + /* Allocate new request structure. */ + pReq = pdmBlkCacheReqAlloc(pvUser); + if (RT_UNLIKELY(!pReq)) + return VERR_NO_MEMORY; + + /* Increment data transfer counter to keep the request valid while we access it. */ + ASMAtomicIncU32(&pReq->cXfersPending); + + while (cbRead) + { + size_t cbToRead; + + pEntry = pdmBlkCacheGetCacheEntryByOffset(pBlkCache, off); + + /* + * If there is no entry we try to create a new one eviciting unused pages + * if the cache is full. If this is not possible we will pass the request through + * and skip the caching (all entries may be still in progress so they can't + * be evicted) + * If we have an entry it can be in one of the LRU lists where the entry + * contains data (recently used or frequently used LRU) so we can just read + * the data we need and put the entry at the head of the frequently used LRU list. + * In case the entry is in one of the ghost lists it doesn't contain any data. + * We have to fetch it again evicting pages from either T1 or T2 to make room. + */ + if (pEntry) + { + uint64_t offDiff = off - pEntry->Core.Key; + + AssertMsg(off >= pEntry->Core.Key, + ("Overflow in calculation off=%llu OffsetAligned=%llu\n", + off, pEntry->Core.Key)); + + AssertPtr(pEntry->pList); + + cbToRead = RT_MIN(pEntry->cbData - offDiff, cbRead); + + AssertMsg(off + cbToRead <= pEntry->Core.Key + pEntry->Core.KeyLast + 1, + ("Buffer of cache entry exceeded off=%llu cbToRead=%d\n", + off, cbToRead)); + + cbRead -= cbToRead; + + if (!cbRead) + STAM_COUNTER_INC(&pCache->cHits); + else + STAM_COUNTER_INC(&pCache->cPartialHits); + + STAM_COUNTER_ADD(&pCache->StatRead, cbToRead); + + /* Ghost lists contain no data. */ + if ( (pEntry->pList == &pCache->LruRecentlyUsedIn) + || (pEntry->pList == &pCache->LruFrequentlyUsed)) + { + if (pdmBlkCacheEntryFlagIsSetClearAcquireLock(pBlkCache, pEntry, + PDMBLKCACHE_ENTRY_IO_IN_PROGRESS, + PDMBLKCACHE_ENTRY_IS_DIRTY)) + { + /* Entry didn't completed yet. Append to the list */ + pdmBlkCacheEntryWaitersAdd(pEntry, pReq, + &SgBuf, offDiff, cbToRead, + false /* fWrite */); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + } + else + { + /* Read as much as we can from the entry. */ + RTSgBufCopyFromBuf(&SgBuf, pEntry->pbData + offDiff, cbToRead); + } + + /* Move this entry to the top position */ + if (pEntry->pList == &pCache->LruFrequentlyUsed) + { + pdmBlkCacheLockEnter(pCache); + pdmBlkCacheEntryAddToList(&pCache->LruFrequentlyUsed, pEntry); + pdmBlkCacheLockLeave(pCache); + } + /* Release the entry */ + pdmBlkCacheEntryRelease(pEntry); + } + else + { + uint8_t *pbBuffer = NULL; + + LogFlow(("Fetching data for ghost entry %#p from file\n", pEntry)); + + pdmBlkCacheLockEnter(pCache); + pdmBlkCacheEntryRemoveFromList(pEntry); /* Remove it before we remove data, otherwise it may get freed when evicting data. */ + bool fEnough = pdmBlkCacheReclaim(pCache, pEntry->cbData, true, &pbBuffer); + + /* Move the entry to Am and fetch it to the cache. */ + if (fEnough) + { + pdmBlkCacheEntryAddToList(&pCache->LruFrequentlyUsed, pEntry); + pdmBlkCacheAdd(pCache, pEntry->cbData); + pdmBlkCacheLockLeave(pCache); + + if (pbBuffer) + pEntry->pbData = pbBuffer; + else + pEntry->pbData = (uint8_t *)RTMemPageAlloc(pEntry->cbData); + AssertPtr(pEntry->pbData); + + pdmBlkCacheEntryWaitersAdd(pEntry, pReq, + &SgBuf, offDiff, cbToRead, + false /* fWrite */); + pdmBlkCacheEntryReadFromMedium(pEntry); + /* Release the entry */ + pdmBlkCacheEntryRelease(pEntry); + } + else + { + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + STAM_PROFILE_ADV_START(&pCache->StatTreeRemove, Cache); + RTAvlrU64Remove(pBlkCache->pTree, pEntry->Core.Key); + STAM_PROFILE_ADV_STOP(&pCache->StatTreeRemove, Cache); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + pdmBlkCacheLockLeave(pCache); + + RTMemFree(pEntry); + + pdmBlkCacheRequestPassthrough(pBlkCache, pReq, + &SgBuf, off, cbToRead, + PDMBLKCACHEXFERDIR_READ); + } + } + } + else + { +#ifdef VBOX_WITH_IO_READ_CACHE + /* No entry found for this offset. Create a new entry and fetch the data to the cache. */ + PPDMBLKCACHEENTRY pEntryNew = pdmBlkCacheEntryCreate(pBlkCache, + off, cbRead, + &cbToRead); + + cbRead -= cbToRead; + + if (pEntryNew) + { + if (!cbRead) + STAM_COUNTER_INC(&pCache->cMisses); + else + STAM_COUNTER_INC(&pCache->cPartialHits); + + pdmBlkCacheEntryWaitersAdd(pEntryNew, pReq, + &SgBuf, + off - pEntryNew->Core.Key, + cbToRead, + false /* fWrite */); + pdmBlkCacheEntryReadFromMedium(pEntryNew); + pdmBlkCacheEntryRelease(pEntryNew); /* it is protected by the I/O in progress flag now. */ + } + else + { + /* + * There is not enough free space in the cache. + * Pass the request directly to the I/O manager. + */ + LogFlow(("Couldn't evict %u bytes from the cache. Remaining request will be passed through\n", cbToRead)); + + pdmBlkCacheRequestPassthrough(pBlkCache, pReq, + &SgBuf, off, cbToRead, + PDMBLKCACHEXFERDIR_READ); + } +#else + /* Clip read size if necessary. */ + PPDMBLKCACHEENTRY pEntryAbove; + pdmBlkCacheGetCacheBestFitEntryByOffset(pBlkCache, off, &pEntryAbove); + + if (pEntryAbove) + { + if (off + cbRead > pEntryAbove->Core.Key) + cbToRead = pEntryAbove->Core.Key - off; + else + cbToRead = cbRead; + + pdmBlkCacheEntryRelease(pEntryAbove); + } + else + cbToRead = cbRead; + + cbRead -= cbToRead; + pdmBlkCacheRequestPassthrough(pBlkCache, pReq, + &SgBuf, off, cbToRead, + PDMBLKCACHEXFERDIR_READ); +#endif + } + off += cbToRead; + } + + if (!pdmBlkCacheReqUpdate(pBlkCache, pReq, rc, false)) + rc = VINF_AIO_TASK_PENDING; + else + { + rc = pReq->rcReq; + RTMemFree(pReq); + } + + LogFlowFunc((": Leave rc=%Rrc\n", rc)); + + return rc; +} + +VMMR3DECL(int) PDMR3BlkCacheWrite(PPDMBLKCACHE pBlkCache, uint64_t off, PCRTSGBUF pSgBuf, size_t cbWrite, void *pvUser) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHEGLOBAL pCache = pBlkCache->pCache; + PPDMBLKCACHEENTRY pEntry; + PPDMBLKCACHEREQ pReq; + + LogFlowFunc((": pBlkCache=%#p{%s} off=%llu pSgBuf=%#p cbWrite=%u pvUser=%#p\n", + pBlkCache, pBlkCache->pszId, off, pSgBuf, cbWrite, pvUser)); + + AssertPtrReturn(pBlkCache, VERR_INVALID_POINTER); + AssertReturn(!pBlkCache->fSuspended, VERR_INVALID_STATE); + + RTSGBUF SgBuf; + RTSgBufClone(&SgBuf, pSgBuf); + + /* Allocate new request structure. */ + pReq = pdmBlkCacheReqAlloc(pvUser); + if (RT_UNLIKELY(!pReq)) + return VERR_NO_MEMORY; + + /* Increment data transfer counter to keep the request valid while we access it. */ + ASMAtomicIncU32(&pReq->cXfersPending); + + while (cbWrite) + { + size_t cbToWrite; + + pEntry = pdmBlkCacheGetCacheEntryByOffset(pBlkCache, off); + if (pEntry) + { + /* Write the data into the entry and mark it as dirty */ + AssertPtr(pEntry->pList); + + uint64_t offDiff = off - pEntry->Core.Key; + AssertMsg(off >= pEntry->Core.Key, ("Overflow in calculation off=%llu OffsetAligned=%llu\n", off, pEntry->Core.Key)); + + cbToWrite = RT_MIN(pEntry->cbData - offDiff, cbWrite); + cbWrite -= cbToWrite; + + if (!cbWrite) + STAM_COUNTER_INC(&pCache->cHits); + else + STAM_COUNTER_INC(&pCache->cPartialHits); + + STAM_COUNTER_ADD(&pCache->StatWritten, cbToWrite); + + /* Ghost lists contain no data. */ + if ( (pEntry->pList == &pCache->LruRecentlyUsedIn) + || (pEntry->pList == &pCache->LruFrequentlyUsed)) + { + /* Check if the entry is dirty. */ + if (pdmBlkCacheEntryFlagIsSetClearAcquireLock(pBlkCache, pEntry, + PDMBLKCACHE_ENTRY_IS_DIRTY, + 0)) + { + /* If it is already dirty but not in progress just update the data. */ + if (!(pEntry->fFlags & PDMBLKCACHE_ENTRY_IO_IN_PROGRESS)) + RTSgBufCopyToBuf(&SgBuf, pEntry->pbData + offDiff, cbToWrite); + else + { + /* The data isn't written to the file yet */ + pdmBlkCacheEntryWaitersAdd(pEntry, pReq, + &SgBuf, offDiff, cbToWrite, + true /* fWrite */); + STAM_COUNTER_INC(&pBlkCache->StatWriteDeferred); + } + + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + } + else /* Dirty bit not set */ + { + /* + * Check if a read is in progress for this entry. + * We have to defer processing in that case. + */ + if (pdmBlkCacheEntryFlagIsSetClearAcquireLock(pBlkCache, pEntry, + PDMBLKCACHE_ENTRY_IO_IN_PROGRESS, + 0)) + { + pdmBlkCacheEntryWaitersAdd(pEntry, pReq, + &SgBuf, offDiff, cbToWrite, + true /* fWrite */); + STAM_COUNTER_INC(&pBlkCache->StatWriteDeferred); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + } + else /* I/O in progress flag not set */ + { + /* Write as much as we can into the entry and update the file. */ + RTSgBufCopyToBuf(&SgBuf, pEntry->pbData + offDiff, cbToWrite); + + bool fCommit = pdmBlkCacheAddDirtyEntry(pBlkCache, pEntry); + if (fCommit) + pdmBlkCacheCommitDirtyEntries(pCache); + } + } /* Dirty bit not set */ + + /* Move this entry to the top position */ + if (pEntry->pList == &pCache->LruFrequentlyUsed) + { + pdmBlkCacheLockEnter(pCache); + pdmBlkCacheEntryAddToList(&pCache->LruFrequentlyUsed, pEntry); + pdmBlkCacheLockLeave(pCache); + } + + pdmBlkCacheEntryRelease(pEntry); + } + else /* Entry is on the ghost list */ + { + uint8_t *pbBuffer = NULL; + + pdmBlkCacheLockEnter(pCache); + pdmBlkCacheEntryRemoveFromList(pEntry); /* Remove it before we remove data, otherwise it may get freed when evicting data. */ + bool fEnough = pdmBlkCacheReclaim(pCache, pEntry->cbData, true, &pbBuffer); + + if (fEnough) + { + /* Move the entry to Am and fetch it to the cache. */ + pdmBlkCacheEntryAddToList(&pCache->LruFrequentlyUsed, pEntry); + pdmBlkCacheAdd(pCache, pEntry->cbData); + pdmBlkCacheLockLeave(pCache); + + if (pbBuffer) + pEntry->pbData = pbBuffer; + else + pEntry->pbData = (uint8_t *)RTMemPageAlloc(pEntry->cbData); + AssertPtr(pEntry->pbData); + + pdmBlkCacheEntryWaitersAdd(pEntry, pReq, + &SgBuf, offDiff, cbToWrite, + true /* fWrite */); + STAM_COUNTER_INC(&pBlkCache->StatWriteDeferred); + pdmBlkCacheEntryReadFromMedium(pEntry); + + /* Release the reference. If it is still needed the I/O in progress flag should protect it now. */ + pdmBlkCacheEntryRelease(pEntry); + } + else + { + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + STAM_PROFILE_ADV_START(&pCache->StatTreeRemove, Cache); + RTAvlrU64Remove(pBlkCache->pTree, pEntry->Core.Key); + STAM_PROFILE_ADV_STOP(&pCache->StatTreeRemove, Cache); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + pdmBlkCacheLockLeave(pCache); + + RTMemFree(pEntry); + pdmBlkCacheRequestPassthrough(pBlkCache, pReq, + &SgBuf, off, cbToWrite, + PDMBLKCACHEXFERDIR_WRITE); + } + } + } + else /* No entry found */ + { + /* + * No entry found. Try to create a new cache entry to store the data in and if that fails + * write directly to the file. + */ + PPDMBLKCACHEENTRY pEntryNew = pdmBlkCacheEntryCreate(pBlkCache, + off, cbWrite, + &cbToWrite); + + cbWrite -= cbToWrite; + + if (pEntryNew) + { + uint64_t offDiff = off - pEntryNew->Core.Key; + + STAM_COUNTER_INC(&pCache->cHits); + + /* + * Check if it is possible to just write the data without waiting + * for it to get fetched first. + */ + if (!offDiff && pEntryNew->cbData == cbToWrite) + { + RTSgBufCopyToBuf(&SgBuf, pEntryNew->pbData, cbToWrite); + + bool fCommit = pdmBlkCacheAddDirtyEntry(pBlkCache, pEntryNew); + if (fCommit) + pdmBlkCacheCommitDirtyEntries(pCache); + STAM_COUNTER_ADD(&pCache->StatWritten, cbToWrite); + } + else + { + /* Defer the write and fetch the data from the endpoint. */ + pdmBlkCacheEntryWaitersAdd(pEntryNew, pReq, + &SgBuf, offDiff, cbToWrite, + true /* fWrite */); + STAM_COUNTER_INC(&pBlkCache->StatWriteDeferred); + pdmBlkCacheEntryReadFromMedium(pEntryNew); + } + + pdmBlkCacheEntryRelease(pEntryNew); + } + else + { + /* + * There is not enough free space in the cache. + * Pass the request directly to the I/O manager. + */ + LogFlow(("Couldn't evict %u bytes from the cache. Remaining request will be passed through\n", cbToWrite)); + + STAM_COUNTER_INC(&pCache->cMisses); + + pdmBlkCacheRequestPassthrough(pBlkCache, pReq, + &SgBuf, off, cbToWrite, + PDMBLKCACHEXFERDIR_WRITE); + } + } + + off += cbToWrite; + } + + if (!pdmBlkCacheReqUpdate(pBlkCache, pReq, rc, false)) + rc = VINF_AIO_TASK_PENDING; + else + { + rc = pReq->rcReq; + RTMemFree(pReq); + } + + LogFlowFunc((": Leave rc=%Rrc\n", rc)); + + return rc; +} + +VMMR3DECL(int) PDMR3BlkCacheFlush(PPDMBLKCACHE pBlkCache, void *pvUser) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHEREQ pReq; + + LogFlowFunc((": pBlkCache=%#p{%s}\n", pBlkCache, pBlkCache->pszId)); + + AssertPtrReturn(pBlkCache, VERR_INVALID_POINTER); + AssertReturn(!pBlkCache->fSuspended, VERR_INVALID_STATE); + + /* Commit dirty entries in the cache. */ + pdmBlkCacheCommit(pBlkCache); + + /* Allocate new request structure. */ + pReq = pdmBlkCacheReqAlloc(pvUser); + if (RT_UNLIKELY(!pReq)) + return VERR_NO_MEMORY; + + rc = pdmBlkCacheRequestPassthrough(pBlkCache, pReq, NULL, 0, 0, + PDMBLKCACHEXFERDIR_FLUSH); + AssertRC(rc); + + LogFlowFunc((": Leave rc=%Rrc\n", rc)); + return VINF_AIO_TASK_PENDING; +} + +VMMR3DECL(int) PDMR3BlkCacheDiscard(PPDMBLKCACHE pBlkCache, PCRTRANGE paRanges, + unsigned cRanges, void *pvUser) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHEGLOBAL pCache = pBlkCache->pCache; + PPDMBLKCACHEENTRY pEntry; + PPDMBLKCACHEREQ pReq; + + LogFlowFunc((": pBlkCache=%#p{%s} paRanges=%#p cRanges=%u pvUser=%#p\n", + pBlkCache, pBlkCache->pszId, paRanges, cRanges, pvUser)); + + AssertPtrReturn(pBlkCache, VERR_INVALID_POINTER); + AssertReturn(!pBlkCache->fSuspended, VERR_INVALID_STATE); + + /* Allocate new request structure. */ + pReq = pdmBlkCacheReqAlloc(pvUser); + if (RT_UNLIKELY(!pReq)) + return VERR_NO_MEMORY; + + /* Increment data transfer counter to keep the request valid while we access it. */ + ASMAtomicIncU32(&pReq->cXfersPending); + + for (unsigned i = 0; i < cRanges; i++) + { + uint64_t offCur = paRanges[i].offStart; + size_t cbLeft = paRanges[i].cbRange; + + while (cbLeft) + { + size_t cbThisDiscard = 0; + + pEntry = pdmBlkCacheGetCacheEntryByOffset(pBlkCache, offCur); + + if (pEntry) + { + /* Write the data into the entry and mark it as dirty */ + AssertPtr(pEntry->pList); + + uint64_t offDiff = offCur - pEntry->Core.Key; + + AssertMsg(offCur >= pEntry->Core.Key, + ("Overflow in calculation offCur=%llu OffsetAligned=%llu\n", + offCur, pEntry->Core.Key)); + + cbThisDiscard = RT_MIN(pEntry->cbData - offDiff, cbLeft); + + /* Ghost lists contain no data. */ + if ( (pEntry->pList == &pCache->LruRecentlyUsedIn) + || (pEntry->pList == &pCache->LruFrequentlyUsed)) + { + /* Check if the entry is dirty. */ + if (pdmBlkCacheEntryFlagIsSetClearAcquireLock(pBlkCache, pEntry, + PDMBLKCACHE_ENTRY_IS_DIRTY, + 0)) + { + /* If it is dirty but not yet in progress remove it. */ + if (!(pEntry->fFlags & PDMBLKCACHE_ENTRY_IO_IN_PROGRESS)) + { + pdmBlkCacheLockEnter(pCache); + pdmBlkCacheEntryRemoveFromList(pEntry); + + STAM_PROFILE_ADV_START(&pCache->StatTreeRemove, Cache); + RTAvlrU64Remove(pBlkCache->pTree, pEntry->Core.Key); + STAM_PROFILE_ADV_STOP(&pCache->StatTreeRemove, Cache); + + pdmBlkCacheLockLeave(pCache); + + RTMemFree(pEntry); + } + else + { +#if 0 + /* The data isn't written to the file yet */ + pdmBlkCacheEntryWaitersAdd(pEntry, pReq, + &SgBuf, offDiff, cbToWrite, + true /* fWrite */); + STAM_COUNTER_INC(&pBlkCache->StatWriteDeferred); +#endif + } + + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + pdmBlkCacheEntryRelease(pEntry); + } + else /* Dirty bit not set */ + { + /* + * Check if a read is in progress for this entry. + * We have to defer processing in that case. + */ + if(pdmBlkCacheEntryFlagIsSetClearAcquireLock(pBlkCache, pEntry, + PDMBLKCACHE_ENTRY_IO_IN_PROGRESS, + 0)) + { +#if 0 + pdmBlkCacheEntryWaitersAdd(pEntry, pReq, + &SgBuf, offDiff, cbToWrite, + true /* fWrite */); +#endif + STAM_COUNTER_INC(&pBlkCache->StatWriteDeferred); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + pdmBlkCacheEntryRelease(pEntry); + } + else /* I/O in progress flag not set */ + { + pdmBlkCacheLockEnter(pCache); + pdmBlkCacheEntryRemoveFromList(pEntry); + + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + STAM_PROFILE_ADV_START(&pCache->StatTreeRemove, Cache); + RTAvlrU64Remove(pBlkCache->pTree, pEntry->Core.Key); + STAM_PROFILE_ADV_STOP(&pCache->StatTreeRemove, Cache); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + pdmBlkCacheLockLeave(pCache); + + RTMemFree(pEntry); + } + } /* Dirty bit not set */ + } + else /* Entry is on the ghost list just remove cache entry. */ + { + pdmBlkCacheLockEnter(pCache); + pdmBlkCacheEntryRemoveFromList(pEntry); + + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + STAM_PROFILE_ADV_START(&pCache->StatTreeRemove, Cache); + RTAvlrU64Remove(pBlkCache->pTree, pEntry->Core.Key); + STAM_PROFILE_ADV_STOP(&pCache->StatTreeRemove, Cache); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + pdmBlkCacheLockLeave(pCache); + + RTMemFree(pEntry); + } + } + /* else: no entry found. */ + + offCur += cbThisDiscard; + cbLeft -= cbThisDiscard; + } + } + + if (!pdmBlkCacheReqUpdate(pBlkCache, pReq, rc, false)) + rc = VINF_AIO_TASK_PENDING; + else + { + rc = pReq->rcReq; + RTMemFree(pReq); + } + + LogFlowFunc((": Leave rc=%Rrc\n", rc)); + + return rc; +} + +/** + * Completes a task segment freeing all resources and completes the task handle + * if everything was transferred. + * + * @returns Next task segment handle. + * @param pBlkCache The endpoint block cache. + * @param pWaiter Task segment to complete. + * @param rc Status code to set. + */ +static PPDMBLKCACHEWAITER pdmBlkCacheWaiterComplete(PPDMBLKCACHE pBlkCache, PPDMBLKCACHEWAITER pWaiter, int rc) +{ + PPDMBLKCACHEWAITER pNext = pWaiter->pNext; + PPDMBLKCACHEREQ pReq = pWaiter->pReq; + + pdmBlkCacheReqUpdate(pBlkCache, pReq, rc, true); + + RTMemFree(pWaiter); + + return pNext; +} + +static void pdmBlkCacheIoXferCompleteEntry(PPDMBLKCACHE pBlkCache, PPDMBLKCACHEIOXFER hIoXfer, int rcIoXfer) +{ + PPDMBLKCACHEENTRY pEntry = hIoXfer->pEntry; + PPDMBLKCACHEGLOBAL pCache = pBlkCache->pCache; + + /* Reference the entry now as we are clearing the I/O in progress flag + * which protected the entry till now. */ + pdmBlkCacheEntryRef(pEntry); + + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + pEntry->fFlags &= ~PDMBLKCACHE_ENTRY_IO_IN_PROGRESS; + + /* Process waiting segment list. The data in entry might have changed in-between. */ + bool fDirty = false; + PPDMBLKCACHEWAITER pComplete = pEntry->pWaitingHead; + PPDMBLKCACHEWAITER pCurr = pComplete; + + AssertMsg((pCurr && pEntry->pWaitingTail) || (!pCurr && !pEntry->pWaitingTail), + ("The list tail was not updated correctly\n")); + pEntry->pWaitingTail = NULL; + pEntry->pWaitingHead = NULL; + + if (hIoXfer->enmXferDir == PDMBLKCACHEXFERDIR_WRITE) + { + /* + * An error here is difficult to handle as the original request completed already. + * The error is logged for now and the VM is paused. + * If the user continues the entry is written again in the hope + * the user fixed the problem and the next write succeeds. + */ + if (RT_FAILURE(rcIoXfer)) + { + LogRel(("I/O cache: Error while writing entry at offset %llu (%u bytes) to medium \"%s\" (rc=%Rrc)\n", + pEntry->Core.Key, pEntry->cbData, pBlkCache->pszId, rcIoXfer)); + + if (!ASMAtomicXchgBool(&pCache->fIoErrorVmSuspended, true)) + { + int rc = VMSetRuntimeError(pCache->pVM, VMSETRTERR_FLAGS_SUSPEND | VMSETRTERR_FLAGS_NO_WAIT, "BLKCACHE_IOERR", + N_("The I/O cache encountered an error while updating data in medium \"%s\" (rc=%Rrc). " + "Make sure there is enough free space on the disk and that the disk is working properly. " + "Operation can be resumed afterwards"), + pBlkCache->pszId, rcIoXfer); + AssertRC(rc); + } + + /* Mark the entry as dirty again to get it added to the list later on. */ + fDirty = true; + } + + pEntry->fFlags &= ~PDMBLKCACHE_ENTRY_IS_DIRTY; + + while (pCurr) + { + AssertMsg(pCurr->fWrite, ("Completed write entries should never have read tasks attached\n")); + + RTSgBufCopyToBuf(&pCurr->SgBuf, pEntry->pbData + pCurr->offCacheEntry, pCurr->cbTransfer); + fDirty = true; + pCurr = pCurr->pNext; + } + } + else + { + AssertMsg(hIoXfer->enmXferDir == PDMBLKCACHEXFERDIR_READ, ("Invalid transfer type\n")); + AssertMsg(!(pEntry->fFlags & PDMBLKCACHE_ENTRY_IS_DIRTY), + ("Invalid flags set\n")); + + while (pCurr) + { + if (pCurr->fWrite) + { + RTSgBufCopyToBuf(&pCurr->SgBuf, pEntry->pbData + pCurr->offCacheEntry, pCurr->cbTransfer); + fDirty = true; + } + else + RTSgBufCopyFromBuf(&pCurr->SgBuf, pEntry->pbData + pCurr->offCacheEntry, pCurr->cbTransfer); + + pCurr = pCurr->pNext; + } + } + + bool fCommit = false; + if (fDirty) + fCommit = pdmBlkCacheAddDirtyEntry(pBlkCache, pEntry); + + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + /* Dereference so that it isn't protected anymore except we issued anyother write for it. */ + pdmBlkCacheEntryRelease(pEntry); + + if (fCommit) + pdmBlkCacheCommitDirtyEntries(pCache); + + /* Complete waiters now. */ + while (pComplete) + pComplete = pdmBlkCacheWaiterComplete(pBlkCache, pComplete, rcIoXfer); +} + +VMMR3DECL(void) PDMR3BlkCacheIoXferComplete(PPDMBLKCACHE pBlkCache, PPDMBLKCACHEIOXFER hIoXfer, int rcIoXfer) +{ + LogFlowFunc(("pBlkCache=%#p hIoXfer=%#p rcIoXfer=%Rrc\n", pBlkCache, hIoXfer, rcIoXfer)); + + if (hIoXfer->fIoCache) + pdmBlkCacheIoXferCompleteEntry(pBlkCache, hIoXfer, rcIoXfer); + else + pdmBlkCacheReqUpdate(pBlkCache, hIoXfer->pReq, rcIoXfer, true); + + ASMAtomicDecU32(&pBlkCache->cIoXfersActive); + pdmBlkCacheR3TraceMsgF(pBlkCache, "BlkCache: I/O req %#p (%RTbool) completed (%u now active)", + hIoXfer, hIoXfer->fIoCache, pBlkCache->cIoXfersActive); + RTMemFree(hIoXfer); +} + +/** + * Callback for the AVL do with all routine. Waits for a cachen entry to finish any pending I/O. + * + * @returns IPRT status code. + * @param pNode The node to destroy. + * @param pvUser Opaque user data. + */ +static DECLCALLBACK(int) pdmBlkCacheEntryQuiesce(PAVLRU64NODECORE pNode, void *pvUser) +{ + PPDMBLKCACHEENTRY pEntry = (PPDMBLKCACHEENTRY)pNode; + PPDMBLKCACHE pBlkCache = pEntry->pBlkCache; + NOREF(pvUser); + + while (ASMAtomicReadU32(&pEntry->fFlags) & PDMBLKCACHE_ENTRY_IO_IN_PROGRESS) + { + /* Leave the locks to let the I/O thread make progress but reference the entry to prevent eviction. */ + pdmBlkCacheEntryRef(pEntry); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + RTThreadSleep(1); + + /* Re-enter all locks and drop the reference. */ + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + pdmBlkCacheEntryRelease(pEntry); + } + + AssertMsg(!(pEntry->fFlags & PDMBLKCACHE_ENTRY_IO_IN_PROGRESS), + ("Entry is dirty and/or still in progress fFlags=%#x\n", pEntry->fFlags)); + + return VINF_SUCCESS; +} + +VMMR3DECL(int) PDMR3BlkCacheSuspend(PPDMBLKCACHE pBlkCache) +{ + int rc = VINF_SUCCESS; + LogFlowFunc(("pBlkCache=%#p\n", pBlkCache)); + + AssertPtrReturn(pBlkCache, VERR_INVALID_POINTER); + + if (!ASMAtomicReadBool(&pBlkCache->pCache->fIoErrorVmSuspended)) + pdmBlkCacheCommit(pBlkCache); /* Can issue new I/O requests. */ + ASMAtomicXchgBool(&pBlkCache->fSuspended, true); + + /* Wait for all I/O to complete. */ + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + rc = RTAvlrU64DoWithAll(pBlkCache->pTree, true, pdmBlkCacheEntryQuiesce, NULL); + AssertRC(rc); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + return rc; +} + +VMMR3DECL(int) PDMR3BlkCacheResume(PPDMBLKCACHE pBlkCache) +{ + LogFlowFunc(("pBlkCache=%#p\n", pBlkCache)); + + AssertPtrReturn(pBlkCache, VERR_INVALID_POINTER); + + ASMAtomicXchgBool(&pBlkCache->fSuspended, false); + + return VINF_SUCCESS; +} + +VMMR3DECL(int) PDMR3BlkCacheClear(PPDMBLKCACHE pBlkCache) +{ + int rc = VINF_SUCCESS; + PPDMBLKCACHEGLOBAL pCache = pBlkCache->pCache; + + /* + * Commit all dirty entries now (they are waited on for completion during the + * destruction of the AVL tree below). + * The exception is if the VM was paused because of an I/O error before. + */ + if (!ASMAtomicReadBool(&pCache->fIoErrorVmSuspended)) + pdmBlkCacheCommit(pBlkCache); + + /* Make sure nobody is accessing the cache while we delete the tree. */ + pdmBlkCacheLockEnter(pCache); + RTSemRWRequestWrite(pBlkCache->SemRWEntries, RT_INDEFINITE_WAIT); + RTAvlrU64Destroy(pBlkCache->pTree, pdmBlkCacheEntryDestroy, pCache); + RTSemRWReleaseWrite(pBlkCache->SemRWEntries); + + pdmBlkCacheLockLeave(pCache); + return rc; +} + |