diff options
Diffstat (limited to 'src/runtime/mem_linux.go')
-rw-r--r-- | src/runtime/mem_linux.go | 174 |
1 files changed, 174 insertions, 0 deletions
diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go new file mode 100644 index 0000000..3436851 --- /dev/null +++ b/src/runtime/mem_linux.go @@ -0,0 +1,174 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +const ( + _EACCES = 13 + _EINVAL = 22 +) + +// Don't split the stack as this method may be invoked without a valid G, which +// prevents us from allocating more stack. +//go:nosplit +func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer { + p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { + if err == _EACCES { + print("runtime: mmap: access denied\n") + exit(2) + } + if err == _EAGAIN { + print("runtime: mmap: too much locked memory (check 'ulimit -l').\n") + exit(2) + } + return nil + } + sysStat.add(int64(n)) + return p +} + +var adviseUnused = uint32(_MADV_FREE) + +func sysUnused(v unsafe.Pointer, n uintptr) { + // By default, Linux's "transparent huge page" support will + // merge pages into a huge page if there's even a single + // present regular page, undoing the effects of madvise(adviseUnused) + // below. On amd64, that means khugepaged can turn a single + // 4KB page to 2MB, bloating the process's RSS by as much as + // 512X. (See issue #8832 and Linux kernel bug + // https://bugzilla.kernel.org/show_bug.cgi?id=93111) + // + // To work around this, we explicitly disable transparent huge + // pages when we release pages of the heap. However, we have + // to do this carefully because changing this flag tends to + // split the VMA (memory mapping) containing v in to three + // VMAs in order to track the different values of the + // MADV_NOHUGEPAGE flag in the different regions. There's a + // default limit of 65530 VMAs per address space (sysctl + // vm.max_map_count), so we must be careful not to create too + // many VMAs (see issue #12233). + // + // Since huge pages are huge, there's little use in adjusting + // the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid + // exploding the number of VMAs by only adjusting the + // MADV_NOHUGEPAGE flag on a large granularity. This still + // gets most of the benefit of huge pages while keeping the + // number of VMAs under control. With hugePageSize = 2MB, even + // a pessimal heap can reach 128GB before running out of VMAs. + if physHugePageSize != 0 { + // If it's a large allocation, we want to leave huge + // pages enabled. Hence, we only adjust the huge page + // flag on the huge pages containing v and v+n-1, and + // only if those aren't aligned. + var head, tail uintptr + if uintptr(v)&(physHugePageSize-1) != 0 { + // Compute huge page containing v. + head = alignDown(uintptr(v), physHugePageSize) + } + if (uintptr(v)+n)&(physHugePageSize-1) != 0 { + // Compute huge page containing v+n-1. + tail = alignDown(uintptr(v)+n-1, physHugePageSize) + } + + // Note that madvise will return EINVAL if the flag is + // already set, which is quite likely. We ignore + // errors. + if head != 0 && head+physHugePageSize == tail { + // head and tail are different but adjacent, + // so do this in one call. + madvise(unsafe.Pointer(head), 2*physHugePageSize, _MADV_NOHUGEPAGE) + } else { + // Advise the huge pages containing v and v+n-1. + if head != 0 { + madvise(unsafe.Pointer(head), physHugePageSize, _MADV_NOHUGEPAGE) + } + if tail != 0 && tail != head { + madvise(unsafe.Pointer(tail), physHugePageSize, _MADV_NOHUGEPAGE) + } + } + } + + if uintptr(v)&(physPageSize-1) != 0 || n&(physPageSize-1) != 0 { + // madvise will round this to any physical page + // *covered* by this range, so an unaligned madvise + // will release more memory than intended. + throw("unaligned sysUnused") + } + + var advise uint32 + if debug.madvdontneed != 0 { + advise = _MADV_DONTNEED + } else { + advise = atomic.Load(&adviseUnused) + } + if errno := madvise(v, n, int32(advise)); advise == _MADV_FREE && errno != 0 { + // MADV_FREE was added in Linux 4.5. Fall back to MADV_DONTNEED if it is + // not supported. + atomic.Store(&adviseUnused, _MADV_DONTNEED) + madvise(v, n, _MADV_DONTNEED) + } +} + +func sysUsed(v unsafe.Pointer, n uintptr) { + // Partially undo the NOHUGEPAGE marks from sysUnused + // for whole huge pages between v and v+n. This may + // leave huge pages off at the end points v and v+n + // even though allocations may cover these entire huge + // pages. We could detect this and undo NOHUGEPAGE on + // the end points as well, but it's probably not worth + // the cost because when neighboring allocations are + // freed sysUnused will just set NOHUGEPAGE again. + sysHugePage(v, n) +} + +func sysHugePage(v unsafe.Pointer, n uintptr) { + if physHugePageSize != 0 { + // Round v up to a huge page boundary. + beg := alignUp(uintptr(v), physHugePageSize) + // Round v+n down to a huge page boundary. + end := alignDown(uintptr(v)+n, physHugePageSize) + + if beg < end { + madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE) + } + } +} + +// Don't split the stack as this function may be invoked without a valid G, +// which prevents us from allocating more stack. +//go:nosplit +func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(-int64(n)) + munmap(v, n) +} + +func sysFault(v unsafe.Pointer, n uintptr) { + mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, -1, 0) +} + +func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer { + p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0) + if err != 0 { + return nil + } + return p +} + +func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) { + sysStat.add(int64(n)) + + p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0) + if err == _ENOMEM { + throw("runtime: out of memory") + } + if p != v || err != 0 { + throw("runtime: cannot map pages in arena address space") + } +} |