/* $Id: HGSMIMemAlloc.cpp $ */ /** @file * VBox Host Guest Shared Memory Interface (HGSMI) - Memory allocator. */ /* * Copyright (C) 2014-2022 Oracle and/or its affiliates. * * This file is part of VirtualBox base platform packages, as * available from https://www.virtualbox.org. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, in version 3 of the * License. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see . * * SPDX-License-Identifier: GPL-3.0-only */ /* * Memory allocator * ---------------- * * Area [0; AreaSize) contains only the data, control structures are separate. * Block sizes are power of 2: 32B, ..., 1MB * Area size can be anything and will be divided initially to largest possible free blocks. * * The entire area is described by a list of 32 bit block descriptors: * * bits 0..3 - order, which is log2 size of the block - 5: 2^(0+5) ... 2^(15+5) == 32B .. 1MB * * bit 4 - 1 for free blocks. * * bits 5..31 - block offset. * * 31 ... 5 | 4 | 3 ... 0 * offset F order * * There is a sorted collection of all block descriptors * (key is the block offset, bits 0...4 do not interfere with sorting). * Also there are lists of free blocks for each size for fast allocation. * * * Implementation * -------------- * * The blocks collection is a sorted linear list. * * Initially the entire area consists of one or more largest blocks followed by smaller blocks: * * 100B area - 64B block with descriptor: 0x00000011 * 32B block with descriptor: 0x00000030 * 4B unused * * 64K area - one 64K block with descriptor: 0x0000001C * * 512K area - one 512K block with descriptor: 0x0000001F * * When allocating a new block: * * larger free blocks are splitted when there are no smaller free blocks; * * smaller free blocks are merged if they can build a requested larger block. */ #include #include #include /* * We do not want assertions in Linux kernel code to reduce symbol dependencies. */ #if defined(IN_RING0) && defined(RT_OS_LINUX) # define HGSMI_ASSERT_RETURN(a, b) if (!(a)) return (b) # define HGSMI_ASSERT_FAILED() do {} while (0) # define HGSMI_ASSERT(expr) do {} while (0) #else # define HGSMI_ASSERT_RETURN(a, b) AssertReturn(a, b) # define HGSMI_ASSERT_FAILED() AssertFailed() # define HGSMI_ASSERT(expr) Assert(expr) #endif /* !IN_RING0 && RT_OS_LINUX */ DECLINLINE(HGSMIOFFSET) hgsmiMADescriptor(HGSMIOFFSET off, bool fFree, HGSMIOFFSET order) { return (off & HGSMI_MA_DESC_OFFSET_MASK) | (fFree? HGSMI_MA_DESC_FREE_MASK: 0) | (order & HGSMI_MA_DESC_ORDER_MASK); } static void hgsmiMABlockFree(HGSMIMADATA *pMA, HGSMIMABLOCK *pBlock) { pMA->env.pfnFree(pMA->env.pvEnv, pBlock); } static int hgsmiMABlockAlloc(HGSMIMADATA *pMA, HGSMIMABLOCK **ppBlock) { int rc = VINF_SUCCESS; HGSMIMABLOCK *pBlock = (HGSMIMABLOCK *)pMA->env.pfnAlloc(pMA->env.pvEnv, sizeof(HGSMIMABLOCK)); if (pBlock) { RT_ZERO(pBlock->nodeBlock); *ppBlock = pBlock; } else { rc = VERR_NO_MEMORY; } return rc; } /* Divide entire area to free blocks. */ static int hgsmiMAFormat(HGSMIMADATA *pMA) { int rc = VINF_SUCCESS; /* Initial value, it will be updated in the loop below. */ pMA->cbMaxBlock = HGSMI_MA_BLOCK_SIZE_MIN; pMA->cBlocks = 0; HGSMISIZE cbBlock = HGSMI_MA_BLOCK_SIZE_MAX; HGSMISIZE cbRemaining = pMA->area.cbArea; HGSMIOFFSET off = 0; while (cbBlock >= HGSMI_MA_BLOCK_SIZE_MIN) { /* Build a list of free memory blocks with u32BlockSize. */ uint32_t cBlocks = cbRemaining / cbBlock; if (cBlocks > 0) { if (pMA->cbMaxBlock < cbBlock) { pMA->cbMaxBlock = cbBlock; } HGSMIOFFSET order = HGSMIMASize2Order(cbBlock); uint32_t i; for (i = 0; i < cBlocks; ++i) { /* A new free block. */ HGSMIMABLOCK *pBlock; rc = hgsmiMABlockAlloc(pMA, &pBlock); if (RT_FAILURE(rc)) { break; } pBlock->descriptor = hgsmiMADescriptor(off, true, order); RTListAppend(&pMA->listBlocks, &pBlock->nodeBlock); ++pMA->cBlocks; off += cbBlock; cbRemaining -= cbBlock; } } if (RT_FAILURE(rc)) { break; } cbBlock /= 2; } return rc; } static int hgsmiMARebuildFreeLists(HGSMIMADATA *pMA) { int rc = VINF_SUCCESS; HGSMIMABLOCK *pIter; RTListForEach(&pMA->listBlocks, pIter, HGSMIMABLOCK, nodeBlock) { if (HGSMI_MA_DESC_IS_FREE(pIter->descriptor)) { HGSMIOFFSET order = HGSMI_MA_DESC_ORDER(pIter->descriptor); RTListAppend(&pMA->aListFreeBlocks[order], &pIter->nodeFree); } } return rc; } static int hgsmiMARestore(HGSMIMADATA *pMA, HGSMIOFFSET *paDescriptors, uint32_t cDescriptors, HGSMISIZE cbMaxBlock) { int rc = VINF_SUCCESS; pMA->cbMaxBlock = cbMaxBlock; pMA->cBlocks = 0; HGSMISIZE cbRemaining = pMA->area.cbArea; HGSMIOFFSET off = 0; uint32_t i; for (i = 0; i < cDescriptors; ++i) { /* Verify the descriptor. */ HGSMISIZE cbBlock = HGSMIMAOrder2Size(HGSMI_MA_DESC_ORDER(paDescriptors[i])); if ( off != HGSMI_MA_DESC_OFFSET(paDescriptors[i]) || cbBlock > cbRemaining || cbBlock > pMA->cbMaxBlock) { rc = VERR_INVALID_PARAMETER; break; } /* A new free block. */ HGSMIMABLOCK *pBlock; rc = hgsmiMABlockAlloc(pMA, &pBlock); if (RT_FAILURE(rc)) { break; } pBlock->descriptor = paDescriptors[i]; RTListAppend(&pMA->listBlocks, &pBlock->nodeBlock); ++pMA->cBlocks; off += cbBlock; cbRemaining -= cbBlock; } return rc; } static HGSMIMABLOCK *hgsmiMAGetFreeBlock(HGSMIMADATA *pMA, HGSMIOFFSET order) { HGSMIMABLOCK *pBlock = NULL; HGSMIOFFSET i; for (i = order; i < RT_ELEMENTS(pMA->aListFreeBlocks); ++i) { pBlock = RTListGetFirst(&pMA->aListFreeBlocks[i], HGSMIMABLOCK, nodeFree); if (pBlock) { break; } } if (pBlock) { HGSMI_ASSERT_RETURN(HGSMI_MA_DESC_IS_FREE(pBlock->descriptor), NULL); /* Where the block starts. */ HGSMIOFFSET off = HGSMI_MA_DESC_OFFSET(pBlock->descriptor); /* 'i' is the order of the block. */ while (i != order) { /* A larger block was found and need to be split to 2 smaller blocks. */ HGSMIMABLOCK *pBlock2; int rc = hgsmiMABlockAlloc(pMA, &pBlock2); if (RT_FAILURE(rc)) { pBlock = NULL; break; } /* Create 2 blocks with descreased order. */ --i; /* Remove from the free list. */ RTListNodeRemove(&pBlock->nodeFree); pBlock->descriptor = hgsmiMADescriptor(off, true, i); pBlock2->descriptor = hgsmiMADescriptor(off + HGSMIMAOrder2Size(i), true, i); /* Update list of all blocks by inserting pBlock2 after pBlock. */ RTListNodeInsertAfter(&pBlock->nodeBlock, &pBlock2->nodeBlock); ++pMA->cBlocks; /* Update the free list. */ RTListAppend(&pMA->aListFreeBlocks[i], &pBlock->nodeFree); RTListAppend(&pMA->aListFreeBlocks[i], &pBlock2->nodeFree); } } return pBlock; } static void hgsmiMAReformatFreeBlocks(HGSMIMADATA *pMA, HGSMIOFFSET maxId, HGSMIMABLOCK *pStart, HGSMIMABLOCK *pEnd, HGSMISIZE cbBlocks) { int rc = VINF_SUCCESS; /* * Blocks starting from pStart until pEnd will be replaced with * another set of blocks. * * The new set will include the block with the required order. * Since the required order is larger than any existing block, * it will replace at least two existing blocks. * The new set will also have minimal possible number of blocks. * Therefore the new set will have at least one block less. * Blocks will be updated in place and remaining blocks will be * deallocated. */ HGSMISIZE u32BlockSize = HGSMIMAOrder2Size(maxId); HGSMISIZE cbRemaining = cbBlocks; HGSMIOFFSET off = HGSMI_MA_DESC_OFFSET(pStart->descriptor); HGSMIMABLOCK *pBlock = pStart; while (u32BlockSize >= HGSMI_MA_BLOCK_SIZE_MIN && cbRemaining) { /* Build a list of free memory blocks with u32BlockSize. */ uint32_t cBlocks = cbRemaining / u32BlockSize; if (cBlocks > 0) { HGSMIOFFSET order = HGSMIMASize2Order(u32BlockSize); uint32_t i; for (i = 0; i < cBlocks; ++i) { if (pBlock == pEnd) { /* Should never happen because the new set of blocks is supposed to be smaller. */ HGSMI_ASSERT_FAILED(); rc = VERR_OUT_OF_RESOURCES; break; } /* Remove from the free list. */ RTListNodeRemove(&pBlock->nodeFree); pBlock->descriptor = hgsmiMADescriptor(off, true, order); RTListAppend(&pMA->aListFreeBlocks[order], &pBlock->nodeFree); off += u32BlockSize; cbRemaining -= u32BlockSize; pBlock = RTListGetNext(&pMA->listBlocks, pBlock, HGSMIMABLOCK, nodeBlock); } } if (RT_FAILURE(rc)) { break; } u32BlockSize /= 2; } HGSMI_ASSERT(cbRemaining == 0); if (RT_SUCCESS(rc)) { /* Remove remaining free blocks from pBlock until pEnd */ for (;;) { bool fEnd = (pBlock == pEnd); HGSMIMABLOCK *pNext = RTListGetNext(&pMA->listBlocks, pBlock, HGSMIMABLOCK, nodeBlock); RTListNodeRemove(&pBlock->nodeFree); RTListNodeRemove(&pBlock->nodeBlock); --pMA->cBlocks; hgsmiMABlockFree(pMA, pBlock); if (fEnd) { break; } pBlock = pNext; } } } static void hgsmiMAQueryFreeRange(HGSMIMADATA *pMA, HGSMIMABLOCK *pBlock, HGSMISIZE cbRequired, HGSMIMABLOCK **ppStart, HGSMIMABLOCK **ppEnd, HGSMISIZE *pcbBlocks) { HGSMI_ASSERT(HGSMI_MA_DESC_IS_FREE(pBlock->descriptor)); *pcbBlocks = HGSMIMAOrder2Size(HGSMI_MA_DESC_ORDER(pBlock->descriptor)); *ppStart = pBlock; *ppEnd = pBlock; HGSMIMABLOCK *p; for (;;) { p = RTListGetNext(&pMA->listBlocks, *ppEnd, HGSMIMABLOCK, nodeBlock); if (!p || !HGSMI_MA_DESC_IS_FREE(p->descriptor)) { break; } *pcbBlocks += HGSMIMAOrder2Size(HGSMI_MA_DESC_ORDER(p->descriptor)); *ppEnd = p; if (cbRequired && *pcbBlocks >= cbRequired) { return; } } for (;;) { p = RTListGetPrev(&pMA->listBlocks, *ppStart, HGSMIMABLOCK, nodeBlock); if (!p || !HGSMI_MA_DESC_IS_FREE(p->descriptor)) { break; } *pcbBlocks += HGSMIMAOrder2Size(HGSMI_MA_DESC_ORDER(p->descriptor)); *ppStart = p; if (cbRequired && *pcbBlocks >= cbRequired) { return; } } } static void hgsmiMAMergeFreeBlocks(HGSMIMADATA *pMA, HGSMIOFFSET order) { /* Try to create a free block with the order from smaller free blocks. */ if (order == 0) { /* No smaller blocks. */ return; } HGSMISIZE cbRequired = HGSMIMAOrder2Size(order); /* Scan all free lists of smaller blocks. * * Get the sequence of free blocks before and after each free block. * If possible, re-split the sequence to get the required block and other free block(s). * If not possible, try the next free block. * * Free blocks are scanned from i to 0 orders. */ HGSMIOFFSET i = order - 1; for (;;) { HGSMIMABLOCK *pIter; RTListForEach(&pMA->aListFreeBlocks[i], pIter, HGSMIMABLOCK, nodeFree) { HGSMI_ASSERT(HGSMI_MA_DESC_ORDER(pIter->descriptor) == i); HGSMISIZE cbBlocks; HGSMIMABLOCK *pFreeStart; HGSMIMABLOCK *pFreeEnd; hgsmiMAQueryFreeRange(pMA, pIter, cbRequired, &pFreeStart, &pFreeEnd, &cbBlocks); HGSMI_ASSERT((cbBlocks / HGSMI_MA_BLOCK_SIZE_MIN) * HGSMI_MA_BLOCK_SIZE_MIN == cbBlocks); /* Verify whether cbBlocks is enough for the requested block. */ if (cbBlocks >= cbRequired) { /* Build new free blocks starting from the requested. */ hgsmiMAReformatFreeBlocks(pMA, order, pFreeStart, pFreeEnd, cbBlocks); i = 0; /* Leave the loop. */ break; } } if (i == 0) { break; } --i; } } static HGSMIOFFSET hgsmiMAAlloc(HGSMIMADATA *pMA, HGSMISIZE cb) { if (cb > pMA->cbMaxBlock) { return HGSMIOFFSET_VOID; } if (cb < HGSMI_MA_BLOCK_SIZE_MIN) { cb = HGSMI_MA_BLOCK_SIZE_MIN; } HGSMIOFFSET order = HGSMIPopCnt32(cb - 1) - HGSMI_MA_DESC_ORDER_BASE; HGSMI_ASSERT_RETURN(HGSMIMAOrder2Size(order) >= cb, HGSMIOFFSET_VOID); HGSMI_ASSERT_RETURN(order < RT_ELEMENTS(pMA->aListFreeBlocks), HGSMIOFFSET_VOID); HGSMIMABLOCK *pBlock = hgsmiMAGetFreeBlock(pMA, order); if (RT_UNLIKELY(pBlock == NULL)) { /* No free block with large enough size. Merge smaller free blocks and try again. */ hgsmiMAMergeFreeBlocks(pMA, order); pBlock = hgsmiMAGetFreeBlock(pMA, order); } if (RT_LIKELY(pBlock != NULL)) { RTListNodeRemove(&pBlock->nodeFree); pBlock->descriptor &= ~HGSMI_MA_DESC_FREE_MASK; return HGSMI_MA_DESC_OFFSET(pBlock->descriptor); } return HGSMIOFFSET_VOID; } static void hgsmiMAFree(HGSMIMADATA *pMA, HGSMIOFFSET off) { if (off == HGSMIOFFSET_VOID) { return; } /* Find the block corresponding to the offset. */ HGSMI_ASSERT((off / HGSMI_MA_BLOCK_SIZE_MIN) * HGSMI_MA_BLOCK_SIZE_MIN == off); HGSMIMABLOCK *pBlock = HGSMIMASearchOffset(pMA, off); if (pBlock) { if (HGSMI_MA_DESC_OFFSET(pBlock->descriptor) == off) { /* Found the right block, mark it as free. */ pBlock->descriptor |= HGSMI_MA_DESC_FREE_MASK; RTListAppend(&pMA->aListFreeBlocks[HGSMI_MA_DESC_ORDER(pBlock->descriptor)], &pBlock->nodeFree); return; } } HGSMI_ASSERT_FAILED(); } int HGSMIMAInit(HGSMIMADATA *pMA, const HGSMIAREA *pArea, HGSMIOFFSET *paDescriptors, uint32_t cDescriptors, HGSMISIZE cbMaxBlock, const HGSMIENV *pEnv) { HGSMI_ASSERT_RETURN(pArea->cbArea < UINT32_C(0x80000000), VERR_INVALID_PARAMETER); HGSMI_ASSERT_RETURN(pArea->cbArea >= HGSMI_MA_BLOCK_SIZE_MIN, VERR_INVALID_PARAMETER); RT_ZERO(*pMA); HGSMISIZE cb = (pArea->cbArea / HGSMI_MA_BLOCK_SIZE_MIN) * HGSMI_MA_BLOCK_SIZE_MIN; int rc = HGSMIAreaInitialize(&pMA->area, pArea->pu8Base, cb, 0); if (RT_SUCCESS(rc)) { pMA->env = *pEnv; uint32_t i; for (i = 0; i < RT_ELEMENTS(pMA->aListFreeBlocks); ++i) { RTListInit(&pMA->aListFreeBlocks[i]); } RTListInit(&pMA->listBlocks); if (cDescriptors) { rc = hgsmiMARestore(pMA, paDescriptors, cDescriptors, cbMaxBlock); } else { rc = hgsmiMAFormat(pMA); } if (RT_SUCCESS(rc)) { rc = hgsmiMARebuildFreeLists(pMA); } } return rc; } void HGSMIMAUninit(HGSMIMADATA *pMA) { HGSMIMABLOCK *pIter; HGSMIMABLOCK *pNext; /* If it has been initialized. */ if (pMA->listBlocks.pNext) { RTListForEachSafe(&pMA->listBlocks, pIter, pNext, HGSMIMABLOCK, nodeBlock) { RTListNodeRemove(&pIter->nodeBlock); hgsmiMABlockFree(pMA, pIter); } } RT_ZERO(*pMA); } HGSMIOFFSET HGSMIMAPointerToOffset(const HGSMIMADATA *pMA, const void RT_UNTRUSTED_VOLATILE_GUEST *pv) { uintptr_t off = (uintptr_t)pv - (uintptr_t)pMA->area.pu8Base; if (off < pMA->area.cbArea) return pMA->area.offBase + off; HGSMI_ASSERT_FAILED(); return HGSMIOFFSET_VOID; } static void RT_UNTRUSTED_VOLATILE_HSTGST *HGSMIMAOffsetToPointer(const HGSMIMADATA *pMA, HGSMIOFFSET off) { if (HGSMIAreaContainsOffset(&pMA->area, off)) { return HGSMIOffsetToPointer(&pMA->area, off); } HGSMI_ASSERT_FAILED(); return NULL; } void RT_UNTRUSTED_VOLATILE_HSTGST *HGSMIMAAlloc(HGSMIMADATA *pMA, HGSMISIZE cb) { HGSMIOFFSET off = hgsmiMAAlloc(pMA, cb); return HGSMIMAOffsetToPointer(pMA, off); } void HGSMIMAFree(HGSMIMADATA *pMA, void RT_UNTRUSTED_VOLATILE_GUEST *pv) { HGSMIOFFSET off = HGSMIMAPointerToOffset(pMA, pv); if (off != HGSMIOFFSET_VOID) { hgsmiMAFree(pMA, off); } else { HGSMI_ASSERT_FAILED(); } } HGSMIMABLOCK *HGSMIMASearchOffset(HGSMIMADATA *pMA, HGSMIOFFSET off) { /* Binary search in the block list for the offset. */ HGSMIMABLOCK *pStart = RTListGetFirst(&pMA->listBlocks, HGSMIMABLOCK, nodeBlock); HGSMIMABLOCK *pEnd = RTListGetLast(&pMA->listBlocks, HGSMIMABLOCK, nodeBlock); HGSMIMABLOCK *pMiddle; uint32_t iStart = 0; uint32_t iEnd = pMA->cBlocks; uint32_t iMiddle; for (;;) { pMiddle = pStart; iMiddle = iStart + (iEnd - iStart) / 2; if (iMiddle == iStart) { break; } /* Find the block with the iMiddle index. Never go further than pEnd. */ uint32_t i; for (i = iStart; i < iMiddle && pMiddle != pEnd; ++i) { pMiddle = RTListNodeGetNext(&pMiddle->nodeBlock, HGSMIMABLOCK, nodeBlock); } HGSMIOFFSET offMiddle = HGSMI_MA_DESC_OFFSET(pMiddle->descriptor); if (offMiddle > off) { pEnd = pMiddle; iEnd = iMiddle; } else { pStart = pMiddle; iStart = iMiddle; } } return pMiddle; } /* * Helper. */ uint32_t HGSMIPopCnt32(uint32_t u32) { uint32_t c = 0; if (u32 > 0xFFFF) { c += 16; u32 >>= 16; } if (u32 > 0xFF) { c += 8; u32 >>= 8; } if (u32 > 0xF) { c += 4; u32 >>= 4; } if (u32 > 0x3) { c += 2; u32 >>= 2; } if (u32 > 0x1) { c += 1; u32 >>= 1; } return c + u32; }