diff options
Diffstat (limited to '')
-rw-r--r-- | mysys/my_largepage.c | 488 |
1 files changed, 488 insertions, 0 deletions
diff --git a/mysys/my_largepage.c b/mysys/my_largepage.c new file mode 100644 index 00000000..71527a9b --- /dev/null +++ b/mysys/my_largepage.c @@ -0,0 +1,488 @@ +/* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + Copyright (c) 2019, 2020 IBM. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "mysys_priv.h" +#include <mysys_err.h> + +#ifdef __linux__ +#include <dirent.h> +#endif +#if defined(__linux__) || defined(MAP_ALIGNED) +#include "my_bit.h" +#endif +#ifdef HAVE_LINUX_MMAN_H +#include <linux/mman.h> +#endif + +#ifdef HAVE_SOLARIS_LARGE_PAGES +#if defined(__sun__) && defined(__GNUC__) && defined(__cplusplus) \ + && defined(_XOPEN_SOURCE) +/* memcntl exist within sys/mman.h, but under-defines what is need to use it */ +extern int memcntl(caddr_t, size_t, int, caddr_t, int, int); +#endif /* __sun__ ... */ +#endif /* HAVE_SOLARIS_LARGE_PAGES */ + +#if defined(_WIN32) +static size_t my_large_page_size; +#define HAVE_LARGE_PAGES +#elif defined(HAVE_MMAP) +#define HAVE_LARGE_PAGES +#endif + +#ifdef HAVE_LARGE_PAGES +static my_bool my_use_large_pages= 0; +#else +#define my_use_large_pages 0 +#endif + +#if defined(HAVE_GETPAGESIZES) || defined(__linux__) +/* Descending sort */ + +static int size_t_cmp(const void *a, const void *b) +{ + const size_t ia= *(const size_t *) a; + const size_t ib= *(const size_t *) b; + if (ib > ia) + { + return 1; + } + else if (ib < ia) + { + return -1; + } + return 0; +} +#endif /* defined(HAVE_GETPAGESIZES) || defined(__linux__) */ + + +#if defined(__linux__) || defined(HAVE_GETPAGESIZES) +#define my_large_page_sizes_length 8 +static size_t my_large_page_sizes[my_large_page_sizes_length]; +#endif + +/** + Linux-specific function to determine the sizes of large pages +*/ +#ifdef __linux__ +static inline my_bool my_is_2pow(size_t n) { return !((n) & ((n) - 1)); } + +static void my_get_large_page_sizes(size_t sizes[my_large_page_sizes_length]) +{ + DIR *dirp; + struct dirent *r; + int i= 0; + DBUG_ENTER("my_get_large_page_sizes"); + + dirp= opendir("/sys/kernel/mm/hugepages"); + if (dirp == NULL) + { + my_error(EE_DIR, MYF(ME_BELL), "/sys/kernel/mm/hugepages", errno); + } + else + { + while (i < my_large_page_sizes_length && (r= readdir(dirp))) + { + if (strncmp("hugepages-", r->d_name, 10) == 0) + { + sizes[i]= strtoull(r->d_name + 10, NULL, 10) * 1024ULL; + if (!my_is_2pow(sizes[i])) + { + my_printf_error(0, + "non-power of 2 large page size (%zu) found," + " skipping", MYF(ME_NOTE | ME_ERROR_LOG_ONLY), + sizes[i]); + sizes[i]= 0; + continue; + } + ++i; + } + } + if (closedir(dirp)) + { + my_error(EE_BADCLOSE, MYF(ME_BELL), "/sys/kernel/mm/hugepages", errno); + } + qsort(sizes, i, sizeof(size_t), size_t_cmp); + } + DBUG_VOID_RETURN; +} + + +#elif defined(HAVE_GETPAGESIZES) +static void my_get_large_page_sizes(size_t sizes[my_large_page_sizes_length]) +{ + int nelem; + + nelem= getpagesizes(NULL, 0); + + assert(nelem <= my_large_page_sizes_length); + getpagesizes(sizes, my_large_page_sizes_length); + qsort(sizes, nelem, sizeof(size_t), size_t_cmp); + if (nelem < my_large_page_sizes_length) + { + sizes[nelem]= 0; + } +} + + +#elif defined(_WIN32) +#define my_large_page_sizes_length 0 +#define my_get_large_page_sizes(A) do {} while(0) + +#else +#define my_large_page_sizes_length 1 +static size_t my_large_page_sizes[my_large_page_sizes_length]; +static void my_get_large_page_sizes(size_t sizes[]) +{ + sizes[0]= my_getpagesize(); +} +#endif + + +/** + Returns the next large page size smaller or equal to the passed in size. + + The search starts at my_large_page_sizes[*start]. + + Assumes my_get_large_page_sizes(my_large_page_sizes) has been called before + use. + + For first use, have *start=0. There is no need to increment *start. + + @param[in] sz size to be searched for. + @param[in,out] start ptr to int representing offset in my_large_page_sizes to + start from. + *start is updated during search and can be used to search again if 0 isn't + returned. + + @returns the next size found. *start will be incremented to the next potential + size. + @retval a large page size that is valid on this system or 0 if no large page + size possible. +*/ +#if defined(HAVE_MMAP) && !defined(_WIN32) +static size_t my_next_large_page_size(size_t sz, int *start) +{ + DBUG_ENTER("my_next_large_page_size"); + + while (*start < my_large_page_sizes_length && my_large_page_sizes[*start] > 0) + { + size_t cur= *start; + (*start)++; + if (my_large_page_sizes[cur] <= sz) + { + DBUG_RETURN(my_large_page_sizes[cur]); + } + } + DBUG_RETURN(0); +} +#endif /* defined(MMAP) || !defined(_WIN32) */ + + +int my_init_large_pages(my_bool super_large_pages) +{ +#ifdef _WIN32 + if (!my_obtain_privilege(SE_LOCK_MEMORY_NAME)) + { + my_printf_error(EE_PERM_LOCK_MEMORY, + "Lock Pages in memory access rights required for use with" + " large-pages, see https://mariadb.com/kb/en/library/" + "mariadb-memory-allocation/#huge-pages", MYF(MY_WME)); + } + my_large_page_size= GetLargePageMinimum(); +#endif + + my_use_large_pages= 1; + my_get_large_page_sizes(my_large_page_sizes); + +#ifndef HAVE_LARGE_PAGES + my_printf_error(EE_OUTOFMEMORY, "No large page support on this platform", + MYF(MY_WME)); +#endif + +#ifdef HAVE_SOLARIS_LARGE_PAGES + /* + tell the kernel that we want to use 4/256MB page for heap storage + and also for the stack. We use 4 MByte as default and if the + super-large-page is set we increase it to 256 MByte. 256 MByte + is for server installations with GBytes of RAM memory where + the MySQL Server will have page caches and other memory regions + measured in a number of GBytes. + We use as big pages as possible which isn't bigger than the above + desired page sizes. + */ + int nelem= 0; + size_t max_desired_page_size= (super_large_pages ? 256 : 4) * 1024 * 1024; + size_t max_page_size= my_next_large_page_size(max_desired_page_size, &nelem); + + if (max_page_size > 0) + { + struct memcntl_mha mpss; + + mpss.mha_cmd= MHA_MAPSIZE_BSSBRK; + mpss.mha_pagesize= max_page_size; + mpss.mha_flags= 0; + if (memcntl(NULL, 0, MC_HAT_ADVISE, (caddr_t) &mpss, 0, 0)) + { + my_error(EE_MEMCNTL, MYF(ME_WARNING | ME_ERROR_LOG_ONLY), "MC_HAT_ADVISE", + "MHA_MAPSIZE_BSSBRK"); + } + mpss.mha_cmd= MHA_MAPSIZE_STACK; + if (memcntl(NULL, 0, MC_HAT_ADVISE, (caddr_t) &mpss, 0, 0)) + { + my_error(EE_MEMCNTL, MYF(ME_WARNING | ME_ERROR_LOG_ONLY), "MC_HAT_ADVISE", + "MHA_MAPSIZE_STACK"); + } + } +#endif /* HAVE_SOLARIS_LARGE_PAGES */ + return 0; +} + + +/** + Large page size helper. + This rounds down, if needed, the size parameter to the largest + multiple of an available large page size on the system. +*/ +void my_large_page_truncate(size_t *size) +{ + if (my_use_large_pages) + { + size_t large_page_size= 0; +#ifdef _WIN32 + large_page_size= my_large_page_size; +#elif defined(HAVE_MMAP) + int page_i= 0; + large_page_size= my_next_large_page_size(*size, &page_i); +#endif + if (large_page_size > 0) + *size-= *size % large_page_size; + } +} + + +#if defined(HAVE_MMAP) && !defined(_WIN32) +/* Solaris for example has only MAP_ANON, FreeBSD has MAP_ANONYMOUS and +MAP_ANON but MAP_ANONYMOUS is marked "for compatibility" */ +#if defined(MAP_ANONYMOUS) +#define OS_MAP_ANON MAP_ANONYMOUS +#elif defined(MAP_ANON) +#define OS_MAP_ANON MAP_ANON +#else +#error unsupported mmap - no MAP_ANON{YMOUS} +#endif +#endif /* HAVE_MMAP && !_WIN32 */ + +/** + General large pages allocator. + Tries to allocate memory from large pages pool and falls back to + my_malloc_lock() in case of failure. + Every implementation returns a zero filled buffer here. +*/ +uchar *my_large_malloc(size_t *size, myf my_flags) +{ + uchar *ptr= NULL; + +#ifdef _WIN32 + DWORD alloc_type= MEM_COMMIT | MEM_RESERVE; + size_t orig_size= *size; + DBUG_ENTER("my_large_malloc"); + + if (my_use_large_pages) + { + alloc_type|= MEM_LARGE_PAGES; + /* Align block size to my_large_page_size */ + *size= MY_ALIGN(*size, (size_t) my_large_page_size); + } + ptr= VirtualAlloc(NULL, *size, alloc_type, PAGE_READWRITE); + if (!ptr) + { + if (my_flags & MY_WME) + { + if (my_use_large_pages) + { + my_printf_error(EE_OUTOFMEMORY, + "Couldn't allocate %zu bytes (MEM_LARGE_PAGES page " + "size %zu); Windows error %lu", + MYF(ME_WARNING | ME_ERROR_LOG_ONLY), *size, + my_large_page_size, GetLastError()); + } + else + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL+ME_ERROR_LOG), *size); + } + } + if (my_use_large_pages) + { + *size= orig_size; + ptr= VirtualAlloc(NULL, *size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); + if (!ptr && my_flags & MY_WME) + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL+ME_ERROR_LOG), *size); + } + } + } +#elif defined(HAVE_MMAP) + int mapflag; + int page_i= 0; + size_t large_page_size= 0; + size_t aligned_size= *size; + DBUG_ENTER("my_large_malloc"); + + while (1) + { + mapflag= MAP_PRIVATE | OS_MAP_ANON; + if (my_use_large_pages) + { + large_page_size= my_next_large_page_size(*size, &page_i); + /* this might be 0, in which case we do a standard mmap */ + if (large_page_size) + { +#if defined(MAP_HUGETLB) /* linux 2.6.32 */ + mapflag|= MAP_HUGETLB; +#if defined(MAP_HUGE_SHIFT) /* Linux-3.8+ */ + mapflag|= my_bit_log2_size_t(large_page_size) << MAP_HUGE_SHIFT; +#else +# warning "No explicit large page (HUGETLB pages) support in Linux < 3.8" +#endif +#elif defined(MAP_ALIGNED) + mapflag|= MAP_ALIGNED(my_bit_log2_size_t(large_page_size)); +#if defined(MAP_ALIGNED_SUPER) + mapflag|= MAP_ALIGNED_SUPER; +#endif +#endif + aligned_size= MY_ALIGN(*size, (size_t) large_page_size); + } + else + { + aligned_size= *size; + } + } + ptr= mmap(NULL, aligned_size, PROT_READ | PROT_WRITE, mapflag, -1, 0); + if (ptr == (void*) -1) + { + ptr= NULL; + if (my_flags & MY_WME) + { + if (large_page_size && errno == ENOMEM) + { + my_printf_error(EE_OUTOFMEMORY, + "Couldn't allocate %zu bytes (Large/HugeTLB memory " + "page size %zu); errno %u; continuing to smaller size", + MYF(ME_WARNING | ME_ERROR_LOG_ONLY), + aligned_size, large_page_size, errno); + } + else + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL+ME_ERROR_LOG), aligned_size); + } + } + /* try next smaller memory size */ + if (large_page_size && errno == ENOMEM) + continue; + + /* other errors are more serious */ + break; + } + else /* success */ + { + if (large_page_size) + { + /* + we do need to record the adjustment so that munmap gets called with + the right size. This is only the case for HUGETLB pages. + */ + *size= aligned_size; + } + break; + } + if (large_page_size == 0) + { + break; /* no more options to try */ + } + } +#else + DBUG_RETURN(my_malloc_lock(*size, my_flags)); +#endif /* defined(HAVE_MMAP) */ + + if (ptr != NULL) + { + MEM_MAKE_DEFINED(ptr, *size); + update_malloc_size(*size, 0); + } + + DBUG_RETURN(ptr); +} + + +/** + General large pages deallocator. + Tries to deallocate memory as if it was from large pages pool and falls back + to my_free_lock() in case of failure +*/ +void my_large_free(void *ptr, size_t size) +{ + DBUG_ENTER("my_large_free"); + + /* + The following implementations can only fail if ptr was not allocated with + my_large_malloc(), i.e. my_malloc_lock() was used so we should free it + with my_free_lock() + + For ASAN, we need to explicitly unpoison this memory region because the OS + may reuse that memory for some TLS or stack variable. It will remain + poisoned if it was explicitly poisioned before release. If this happens, + we'll have hard to debug false positives like in MDEV-21239. + For valgrind, we mark it as UNDEFINED rather than NOACCESS because of the + implict reuse possiblility. + */ +#if defined(HAVE_MMAP) && !defined(_WIN32) + if (munmap(ptr, size)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, errno); + } +#if !__has_feature(memory_sanitizer) + else + { + MEM_MAKE_ADDRESSABLE(ptr, size); + } +#endif + update_malloc_size(- (longlong) size, 0); +#elif defined(_WIN32) + /* + When RELEASE memory, the size parameter must be 0. + Do not use MEM_RELEASE with MEM_DECOMMIT. + */ + if (ptr) + { + if (!VirtualFree(ptr, 0, MEM_RELEASE)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, + GetLastError()); + } + update_malloc_size(- (longlong) size, 0); + } +#if !__has_feature(memory_sanitizer) + else + { + MEM_MAKE_ADDRESSABLE(ptr, size); + } +#endif /* memory_sanitizer */ +#else + my_free_lock(ptr); +#endif /* HAVE_MMMAP */ + + DBUG_VOID_RETURN; +} |