/* $Id: alloc-r0drv-linux.c $ */
/** @file
 * IPRT - Memory Allocation, Ring-0 Driver, Linux.
 */

/*
 * Copyright (C) 2006-2019 Oracle Corporation
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License (GPL) as published by the Free Software
 * Foundation, in version 2 as it comes in the "COPYING" file of the
 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
 * VirtualBox OSE distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 */


/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#include "the-linux-kernel.h"
#include "internal/iprt.h"
#include <iprt/mem.h>

#include <iprt/assert.h>
#include <iprt/errcore.h>
#include "r0drv/alloc-r0drv.h"


#if (defined(RT_ARCH_AMD64) || defined(DOXYGEN_RUNNING)) && !defined(RTMEMALLOC_EXEC_HEAP)
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 23)
/**
 * Starting with 2.6.23 we can use __get_vm_area and map_vm_area to allocate
 * memory in the module range.  This is preferable to the exec heap below.
 */
#  define RTMEMALLOC_EXEC_VM_AREA
# else
/**
 * We need memory in the module range (~2GB to ~0); this can only be obtained
 * through APIs that are not exported (see module_alloc()).
 *
 * So, we'll have to create a quick and dirty heap here using BSS memory.
 * Very annoying and it's going to restrict us!
 */
#  define RTMEMALLOC_EXEC_HEAP
# endif
#endif

#ifdef RTMEMALLOC_EXEC_HEAP
# include <iprt/heap.h>
# include <iprt/spinlock.h>
# include <iprt/errcore.h>
#endif


/*********************************************************************************************************************************
*   Structures and Typedefs                                                                                                      *
*********************************************************************************************************************************/
#ifdef RTMEMALLOC_EXEC_VM_AREA
/**
 * Extended header used for headers marked with RTMEMHDR_FLAG_EXEC_VM_AREA.
 *
 * This is used with allocating executable memory, for things like generated
 * code and loaded modules.
 */
typedef struct RTMEMLNXHDREX
{
    /** The VM area for this allocation. */
    struct vm_struct   *pVmArea;
    void               *pvDummy;
    /** The header we present to the generic API. */
    RTMEMHDR            Hdr;
} RTMEMLNXHDREX;
AssertCompileSize(RTMEMLNXHDREX, 32);
/** Pointer to an extended memory header. */
typedef RTMEMLNXHDREX *PRTMEMLNXHDREX;
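
/*
 * Informational sketch of an RTMEMALLOC_EXEC_VM_AREA allocation as set up by
 * rtR0MemAllocExecVmArea() below:
 *
 *    pVmArea->addr -> +---------------------+
 *                     | RTMEMLNXHDREX       |  pVmArea, pvDummy, Hdr
 *                     +---------------------+
 *                     | user memory         |  cb bytes handed to the caller
 *                     +---------------------+ <- size rounded up to PAGE_SIZE
 *
 * Hdr is deliberately the last member, so the generic API's usual
 * "pHdr + 1" translation yields the user memory.
 */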
#endif


/*********************************************************************************************************************************
*   Global Variables                                                                                                             *
*********************************************************************************************************************************/
#ifdef RTMEMALLOC_EXEC_HEAP
/** The heap. */
static RTHEAPSIMPLE g_HeapExec = NIL_RTHEAPSIMPLE;
/** Spinlock protecting the heap. */
static RTSPINLOCK   g_HeapExecSpinlock = NIL_RTSPINLOCK;
#endif


/**
 * API for cleaning up the heap spinlock on IPRT termination.
 * Like RTR0MemExecDonate, this is specific to AMD64 Linux/GNU.
 */
DECLHIDDEN(void) rtR0MemExecCleanup(void)
{
#ifdef RTMEMALLOC_EXEC_HEAP
    RTSpinlockDestroy(g_HeapExecSpinlock);
    g_HeapExecSpinlock = NIL_RTSPINLOCK;
#endif
}


/**
 * Donate read+write+execute memory to the exec heap.
 *
 * This API is specific to AMD64 and Linux/GNU. A kernel module that desires to
 * use RTMemExecAlloc on AMD64 Linux/GNU will have to donate some statically
 * allocated memory in the module if it wishes for GCC generated code to work.
 * Currently GCC can only generate module code that works in the address
 * range ~2GB to ~0.
 *
 * The API only accepts a single donation.
 *
 * @returns IPRT status code.
 * @retval  VERR_NOT_SUPPORTED if the code isn't enabled.
 * @param   pvMemory    Pointer to the memory block.
 * @param   cb          The size of the memory block.
 */
RTR0DECL(int) RTR0MemExecDonate(void *pvMemory, size_t cb)
{
#ifdef RTMEMALLOC_EXEC_HEAP
    int rc;
    AssertReturn(g_HeapExec == NIL_RTHEAPSIMPLE, VERR_WRONG_ORDER);

    rc = RTSpinlockCreate(&g_HeapExecSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_SAFE, "RTR0MemExecDonate");
    if (RT_SUCCESS(rc))
    {
        rc = RTHeapSimpleInit(&g_HeapExec, pvMemory, cb);
        if (RT_FAILURE(rc))
            rtR0MemExecCleanup();
    }
    return rc;
#else
    RT_NOREF_PV(pvMemory); RT_NOREF_PV(cb);
    return VERR_NOT_SUPPORTED;
#endif
}
RT_EXPORT_SYMBOL(RTR0MemExecDonate);
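
/**
 * Example donation (a minimal sketch, assuming RTMEMALLOC_EXEC_HEAP is in
 * effect): the module sets aside a static buffer and donates it exactly once
 * during init, before any executable allocations are made.  The buffer name
 * and size are hypothetical.
 * @code
 *     static uint8_t g_abExecMemory[_1M];  // hypothetical BSS buffer
 *
 *     int rc = RTR0MemExecDonate(&g_abExecMemory[0], sizeof(g_abExecMemory));
 *     if (RT_FAILURE(rc))
 *         return rc; // VERR_NOT_SUPPORTED when the exec heap isn't compiled in
 * @endcode
 */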



#ifdef RTMEMALLOC_EXEC_VM_AREA
/**
 * Allocate executable kernel memory in the module range.
 *
 * @returns Pointer to an allocation header on success, NULL on failure.
 *
 * @param   cb          The size the user requested.
 */
static PRTMEMHDR rtR0MemAllocExecVmArea(size_t cb)
{
    size_t const        cbAlloc = RT_ALIGN_Z(sizeof(RTMEMLNXHDREX) + cb, PAGE_SIZE);
    size_t const        cPages  = cbAlloc >> PAGE_SHIFT;
    struct page       **papPages;
    struct vm_struct   *pVmArea;
    size_t              iPage;

    pVmArea = __get_vm_area(cbAlloc, VM_ALLOC, MODULES_VADDR, MODULES_END);
    if (!pVmArea)
        return NULL;
    pVmArea->nr_pages = 0;    /* paranoia? */
    pVmArea->pages    = NULL; /* paranoia? */

    papPages = (struct page **)kmalloc(cPages * sizeof(papPages[0]), GFP_KERNEL | __GFP_NOWARN);
    if (!papPages)
    {
        vunmap(pVmArea->addr);
        return NULL;
    }

    for (iPage = 0; iPage < cPages; iPage++)
    {
        papPages[iPage] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN);
        if (!papPages[iPage])
            break;
    }
    if (iPage == cPages)
    {
        /*
         * Map the pages.
         *
         * Not entirely sure we really need to set nr_pages and pages here, but
         * they provide a very convenient place for storing something we need
         * in the free function, if nothing else...
         */
# if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0)
        struct page **papPagesIterator = papPages;
# endif
        pVmArea->nr_pages = cPages;
        pVmArea->pages    = papPages;
        if (!map_vm_area(pVmArea, PAGE_KERNEL_EXEC,
# if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0)
                         &papPagesIterator
# else
                         papPages
# endif
                         ))
        {
            PRTMEMLNXHDREX pHdrEx = (PRTMEMLNXHDREX)pVmArea->addr;
            pHdrEx->pVmArea     = pVmArea;
            pHdrEx->pvDummy     = NULL;
            return &pHdrEx->Hdr;
        }
        /* bail out */
# if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0)
        pVmArea->nr_pages = papPagesIterator - papPages;
# endif
    }

    vunmap(pVmArea->addr);

    while (iPage-- > 0)
        __free_page(papPages[iPage]);
    kfree(papPages);

    return NULL;
}
#endif /* RTMEMALLOC_EXEC_VM_AREA */


/**
 * OS specific allocation function.
 */
DECLHIDDEN(int) rtR0MemAllocEx(size_t cb, uint32_t fFlags, PRTMEMHDR *ppHdr)
{
    PRTMEMHDR pHdr;
    IPRT_LINUX_SAVE_EFL_AC();

    /*
     * Allocate.
     */
    if (fFlags & RTMEMHDR_FLAG_EXEC)
    {
        if (fFlags & RTMEMHDR_FLAG_ANY_CTX)
            return VERR_NOT_SUPPORTED;

#if defined(RT_ARCH_AMD64)
# ifdef RTMEMALLOC_EXEC_HEAP
        if (g_HeapExec != NIL_RTHEAPSIMPLE)
        {
            RTSpinlockAcquire(g_HeapExecSpinlock);
            pHdr = (PRTMEMHDR)RTHeapSimpleAlloc(g_HeapExec, cb + sizeof(*pHdr), 0);
            RTSpinlockRelease(g_HeapExecSpinlock);
            fFlags |= RTMEMHDR_FLAG_EXEC_HEAP;
        }
        else
            pHdr = NULL;

# elif defined(RTMEMALLOC_EXEC_VM_AREA)
        pHdr = rtR0MemAllocExecVmArea(cb);
        fFlags |= RTMEMHDR_FLAG_EXEC_VM_AREA;

# else  /* !RTMEMALLOC_EXEC_HEAP */
# error "you don not want to go here..."
        pHdr = (PRTMEMHDR)__vmalloc(cb + sizeof(*pHdr), GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, MY_PAGE_KERNEL_EXEC);
# endif /* !RTMEMALLOC_EXEC_HEAP */

#elif defined(PAGE_KERNEL_EXEC) && defined(CONFIG_X86_PAE)
        pHdr = (PRTMEMHDR)__vmalloc(cb + sizeof(*pHdr), GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, MY_PAGE_KERNEL_EXEC);
#else
        pHdr = (PRTMEMHDR)vmalloc(cb + sizeof(*pHdr));
#endif
    }
    else
    {
        if (
#if 1 /* vmalloc has serious performance issues, avoid it. */
               cb <= PAGE_SIZE*16 - sizeof(*pHdr)
#else
               cb <= PAGE_SIZE
#endif
            || (fFlags & RTMEMHDR_FLAG_ANY_CTX)
           )
        {
            fFlags |= RTMEMHDR_FLAG_KMALLOC;
            pHdr = kmalloc(cb + sizeof(*pHdr),
                           (fFlags & RTMEMHDR_FLAG_ANY_CTX_ALLOC) ? (GFP_ATOMIC | __GFP_NOWARN)
                                                                  : (GFP_KERNEL | __GFP_NOWARN));
            if (RT_UNLIKELY(   !pHdr
                            && cb > PAGE_SIZE
                            && !(fFlags & RTMEMHDR_FLAG_ANY_CTX) ))
            {
                fFlags &= ~RTMEMHDR_FLAG_KMALLOC;
                pHdr = vmalloc(cb + sizeof(*pHdr));
            }
        }
        else
            pHdr = vmalloc(cb + sizeof(*pHdr));
    }
    if (RT_UNLIKELY(!pHdr))
    {
        IPRT_LINUX_RESTORE_EFL_AC();
        return VERR_NO_MEMORY;
    }

    /*
     * Initialize.
     */
    pHdr->u32Magic  = RTMEMHDR_MAGIC;
    pHdr->fFlags    = fFlags;
    pHdr->cb        = cb;
    pHdr->cbReq     = cb;

    *ppHdr = pHdr;
    IPRT_LINUX_RESTORE_EFL_AC();
    return VINF_SUCCESS;
}
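
/*
 * A caller's-eye sketch of the policy above (illustrative; ring-0 users reach
 * rtR0MemAllocEx() through the generic IPRT wrappers such as RTMemAlloc):
 * small requests are served by kmalloc, big ones by vmalloc (with a vmalloc
 * fallback if a multi-page kmalloc fails), and any-context requests must fit
 * kmalloc since vmalloc may sleep.
 *
 *     void *pvSmall = RTMemAlloc(512);   // <= 16 pages: kmalloc path
 *     void *pvBig   = RTMemAlloc(_1M);   // large: vmalloc path
 *     RTMemFree(pvBig);
 *     RTMemFree(pvSmall);
 */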


/**
 * OS specific free function.
 */
DECLHIDDEN(void) rtR0MemFree(PRTMEMHDR pHdr)
{
    IPRT_LINUX_SAVE_EFL_AC();

    pHdr->u32Magic += 1;
    if (pHdr->fFlags & RTMEMHDR_FLAG_KMALLOC)
        kfree(pHdr);
#ifdef RTMEMALLOC_EXEC_HEAP
    else if (pHdr->fFlags & RTMEMHDR_FLAG_EXEC_HEAP)
    {
        RTSpinlockAcquire(g_HeapExecSpinlock);
        RTHeapSimpleFree(g_HeapExec, pHdr);
        RTSpinlockRelease(g_HeapExecSpinlock);
    }
#endif
#ifdef RTMEMALLOC_EXEC_VM_AREA
    else if (pHdr->fFlags & RTMEMHDR_FLAG_EXEC_VM_AREA)
    {
        PRTMEMLNXHDREX pHdrEx    = RT_FROM_MEMBER(pHdr, RTMEMLNXHDREX, Hdr);
        size_t         iPage     = pHdrEx->pVmArea->nr_pages;
        struct page  **papPages  = pHdrEx->pVmArea->pages;
        void          *pvMapping = pHdrEx->pVmArea->addr;

        vunmap(pvMapping);

        while (iPage-- > 0)
            __free_page(papPages[iPage]);
        kfree(papPages);
    }
#endif
    else
        vfree(pHdr);

    IPRT_LINUX_RESTORE_EFL_AC();
}



/**
 * Compute order. Some functions allocate 2^order pages.
 *
 * @returns order.
 * @param   cPages      Number of pages.
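 *
 * @note    Rounds up to the next power of two, e.g. cPages=1 gives order 0,
 *          cPages=3 gives order 2 (4 pages) and cPages=5 gives order 3
 *          (8 pages), so close to half of the allocated block can be slack.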
 */
static int CalcPowerOf2Order(unsigned long cPages)
{
    int             iOrder;
    unsigned long   cTmp;

    for (iOrder = 0, cTmp = cPages; cTmp >>= 1; ++iOrder)
        ;
    if (cPages & ~(1 << iOrder))
        ++iOrder;

    return iOrder;
}


/**
 * Allocates physically contiguous memory (below 4GB).
 * The allocation is page aligned and the content is undefined.
 *
 * @returns Pointer to the memory block. This is page aligned.
 * @param   pPhys   Where to store the physical address.
 * @param   cb      The allocation size in bytes. This is always
 *                  rounded up to PAGE_SIZE.
 */
RTR0DECL(void *) RTMemContAlloc(PRTCCPHYS pPhys, size_t cb)
{
    int             cOrder;
    unsigned        cPages;
    struct page    *paPages;
    void           *pvRet;
    IPRT_LINUX_SAVE_EFL_AC();

    /*
     * validate input.
     */
    Assert(VALID_PTR(pPhys));
    Assert(cb > 0);

    /*
     * Allocate page pointer array.
     */
    cb = RT_ALIGN_Z(cb, PAGE_SIZE);
    cPages = cb >> PAGE_SHIFT;
    cOrder = CalcPowerOf2Order(cPages);
#if (defined(RT_ARCH_AMD64) || defined(CONFIG_X86_PAE)) && defined(GFP_DMA32)
    /* ZONE_DMA32: 0-4GB */
    paPages = alloc_pages(GFP_DMA32 | __GFP_NOWARN, cOrder);
    if (!paPages)
#endif
#ifdef RT_ARCH_AMD64
        /* ZONE_DMA: 0-16MB */
        paPages = alloc_pages(GFP_DMA | __GFP_NOWARN, cOrder);
#else
        /* ZONE_NORMAL: 0-896MB */
        paPages = alloc_pages(GFP_USER | __GFP_NOWARN, cOrder);
#endif
    if (paPages)
    {
        /*
         * Reserve the pages and mark them executable.
         */
        unsigned iPage;
        for (iPage = 0; iPage < cPages; iPage++)
        {
            Assert(!PageHighMem(&paPages[iPage]));
            if (iPage + 1 < cPages)
            {
                AssertMsg(          (uintptr_t)phys_to_virt(page_to_phys(&paPages[iPage])) + PAGE_SIZE
                                ==  (uintptr_t)phys_to_virt(page_to_phys(&paPages[iPage + 1]))
                          &&        page_to_phys(&paPages[iPage]) + PAGE_SIZE
                                ==  page_to_phys(&paPages[iPage + 1]),
                          ("iPage=%i cPages=%u [0]=%#llx,%p [1]=%#llx,%p\n", iPage, cPages,
                           (long long)page_to_phys(&paPages[iPage]),     phys_to_virt(page_to_phys(&paPages[iPage])),
                           (long long)page_to_phys(&paPages[iPage + 1]), phys_to_virt(page_to_phys(&paPages[iPage + 1])) ));
            }

            SetPageReserved(&paPages[iPage]);
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 4, 20) /** @todo find the exact kernel where change_page_attr was introduced. */
            MY_SET_PAGES_EXEC(&paPages[iPage], 1);
#endif
        }
        *pPhys = page_to_phys(paPages);
        pvRet = phys_to_virt(page_to_phys(paPages));
    }
    else
        pvRet = NULL;

    IPRT_LINUX_RESTORE_EFL_AC();
    return pvRet;
}
RT_EXPORT_SYMBOL(RTMemContAlloc);
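
/**
 * Example pairing of RTMemContAlloc and RTMemContFree (a minimal sketch; the
 * device handoff is hypothetical):
 * @code
 *     RTCCPHYS Phys;
 *     void    *pv = RTMemContAlloc(&Phys, 2 * PAGE_SIZE);
 *     if (pv)
 *     {
 *         memset(pv, 0, 2 * PAGE_SIZE);     // CPU access via the virtual address
 *         // ... hand Phys to a DMA-capable device here ...
 *         RTMemContFree(pv, 2 * PAGE_SIZE); // pass back the same size
 *     }
 * @endcode
 */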


/**
 * Frees memory allocated using RTMemContAlloc().
 *
 * @param   pv      Pointer returned by RTMemContAlloc().
 * @param   cb      The cb parameter passed to RTMemContAlloc().
 */
RTR0DECL(void) RTMemContFree(void *pv, size_t cb)
{
    if (pv)
    {
        int             cOrder;
        unsigned        cPages;
        unsigned        iPage;
        struct page    *paPages;
        IPRT_LINUX_SAVE_EFL_AC();

        /* validate */
        AssertMsg(!((uintptr_t)pv & PAGE_OFFSET_MASK), ("pv=%p\n", pv));
        Assert(cb > 0);

        /* calc order and get pages */
        cb = RT_ALIGN_Z(cb, PAGE_SIZE);
        cPages = cb >> PAGE_SHIFT;
        cOrder = CalcPowerOf2Order(cPages);
        paPages = virt_to_page(pv);

        /*
         * Restore page attributes and free the pages.
         */
        for (iPage = 0; iPage < cPages; iPage++)
        {
            ClearPageReserved(&paPages[iPage]);
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 4, 20) /** @todo find the exact kernel where change_page_attr was introduced. */
            MY_SET_PAGES_NOEXEC(&paPages[iPage], 1);
#endif
        }
        __free_pages(paPages, cOrder);
        IPRT_LINUX_RESTORE_EFL_AC();
    }
}
RT_EXPORT_SYMBOL(RTMemContFree);