/*-------------------------------------------------------------------------
 *
 * win32_shmem.c
 *	  Implement shared memory using win32 facilities
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/port/win32_shmem.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "miscadmin.h"
#include "storage/dsm.h"
#include "storage/ipc.h"
#include "storage/pg_shmem.h"
#include "utils/guc_hooks.h"

/*
 * Early in a process's life, Windows asynchronously creates threads for the
 * process's "default thread pool"
 * (https://docs.microsoft.com/en-us/windows/desktop/ProcThread/thread-pools).
 * Occasionally, thread creation allocates a stack after
 * PGSharedMemoryReAttach() has released UsedShmemSegAddr and before it has
 * mapped shared memory at UsedShmemSegAddr.  This would cause mapping to fail
 * if the allocator preferred the just-released region for allocating the new
 * thread stack.  We observed such failures in some Windows Server 2016
 * configurations.  To give the system another region to prefer, reserve and
 * release an additional, protective region immediately before reserving or
 * releasing shared memory.  The idea is that, if the allocator handed out
 * REGION1 pages before REGION2 pages at one occasion, it will do so whenever
 * both regions are free.  Windows Server 2016 exhibits that behavior, and a
 * system behaving differently would have less need to protect
 * UsedShmemSegAddr.  The protective region must be at least large enough for
 * one thread stack.  However, ten times as much is less than 2% of the 32-bit
 * address space and is negligible relative to the 64-bit address space.
 */
#define PROTECTIVE_REGION_SIZE (10 * WIN32_STACK_RLIMIT)

/* Base address of the protective region reserved by PGSharedMemoryCreate() */
void	   *ShmemProtectiveRegion = NULL;

/* Inheritable handle of the file mapping backing the segment */
HANDLE		UsedShmemSegID = INVALID_HANDLE_VALUE;

/* Address at which the segment is (or is to be) mapped */
void	   *UsedShmemSegAddr = NULL;

/* Size of the segment as created; used when reserving space in children */
static Size UsedShmemSegSize = 0;

static bool EnableLockPagesPrivilege(int elevel);
static void pgwin32_SharedMemoryDelete(int status, Datum shmId);

/*
 * Generate shared memory segment name. Expand the data directory, to generate
 * an identifier unique for this data directory. Then replace all backslashes
 * with forward slashes, since backslashes aren't permitted in global object names.
 *
 * Store the shared memory segment in the Global\ namespace (requires NT2 TSE or
 * 2000, but that's all we support for other reasons as well), to make sure you can't
 * open two postmasters in different sessions against the same data directory.
 *
 * XXX: What happens with junctions? It's only someone breaking things on purpose,
 * and this is still better than before, but we might want to do something about
 * that sometime in the future.
 *
 * Returns a malloc'd, NUL-terminated string; both callers in this file
 * release it with free().  Every failure is reported with elog(FATAL).
 */
static char *
GetSharedMemName(void)
{
	char	   *retptr;
	DWORD		bufsize;
	DWORD		r;
	char	   *cp;

	/* First call, with a zero-length buffer, only asks for the needed size */
	bufsize = GetFullPathName(DataDir, 0, NULL, NULL);
	if (bufsize == 0)
		elog(FATAL, "could not get size for full pathname of datadir %s: error code %lu",
			 DataDir, GetLastError());

	/*
	 * NOTE(review): no separate byte is added for the terminating NUL, so
	 * this relies on the sizing call's result already including it -- confirm
	 * against the GetFullPathName documentation.
	 */
	retptr = malloc(bufsize + 18);	/* 18 for Global\PostgreSQL: */
	if (retptr == NULL)
		elog(FATAL, "could not allocate memory for shared memory name");

	/* Write the 18-character prefix, then the expanded path right after it */
	strcpy(retptr, "Global\\PostgreSQL:");
	r = GetFullPathName(DataDir, bufsize, retptr + 18, NULL);
	if (r == 0 || r > bufsize)
		elog(FATAL, "could not generate full pathname for datadir %s: error code %lu",
			 DataDir, GetLastError());

	/*
	 * XXX: Intentionally overwriting the Global\ part here. This was not the
	 * original approach, but putting it in the actual Global\ namespace
	 * causes permission errors in a lot of cases, so we leave it in the
	 * default namespace for now.
	 *
	 * The loop starts at the beginning of the string, so the backslash in
	 * the prefix is converted along with those in the path.
	 */
	for (cp = retptr; *cp; cp++)
		if (*cp == '\\')
			*cp = '/';

	return retptr;
}


/*
 * PGSharedMemoryIsInUse
 *
 * Is a previously-existing shmem segment still existing and in use?
 *
 * The point of this exercise is to detect the case where a prior postmaster
 * crashed, but it left child backends that are still running.  Therefore
 * we only care about shmem segments that are associated with the intended
 * DataDir.  This is an important consideration since accidental matches of
 * shmem segment IDs are reasonably common.
 *
 * id1 and id2 are not used by this implementation; the segment is looked up
 * solely by the name derived from DataDir.  Returns true if a file mapping
 * with that name can be opened, false otherwise.
 */
bool
PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
{
	char	   *szShareMem;
	HANDLE		hmap;

	szShareMem = GetSharedMemName();

	hmap = OpenFileMapping(FILE_MAP_READ, FALSE, szShareMem);

	/* The name is only needed for the lookup itself */
	free(szShareMem);

	if (hmap == NULL)
		return false;

	/* We only wanted to know whether it exists; drop our handle again */
	CloseHandle(hmap);
	return true;
}

/*
 * EnableLockPagesPrivilege
 *
 * Try to acquire SeLockMemoryPrivilege so we can use large pages.
 *
 * elevel is the message level used to report any failure.  Returns TRUE if
 * the privilege was enabled, otherwise FALSE after reporting the failure.
 * The process token handle is closed on every path that obtained it.
 */
static bool
EnableLockPagesPrivilege(int elevel)
{
	HANDLE		hToken;
	TOKEN_PRIVILEGES tp;
	LUID		luid;

	if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))
	{
		ereport(elevel,
				(errmsg("could not enable user right \"%s\": error code %lu",

		/*
		 * translator: This is a term from Windows and should be translated to
		 * match the Windows localization.
		 */
						_("Lock pages in memory"),
						GetLastError()),
				 errdetail("Failed system call was %s.", "OpenProcessToken")));
		return FALSE;
	}

	if (!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &luid))
	{
		ereport(elevel,
				(errmsg("could not enable user right \"%s\": error code %lu", _("Lock pages in memory"), GetLastError()),
				 errdetail("Failed system call was %s.", "LookupPrivilegeValue")));
		CloseHandle(hToken);
		return FALSE;
	}

	/* Request that exactly one privilege, the one just looked up, be enabled */
	tp.PrivilegeCount = 1;
	tp.Privileges[0].Luid = luid;
	tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;

	if (!AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL))
	{
		ereport(elevel,
				(errmsg("could not enable user right \"%s\": error code %lu", _("Lock pages in memory"), GetLastError()),
				 errdetail("Failed system call was %s.", "AdjustTokenPrivileges")));
		CloseHandle(hToken);
		return FALSE;
	}

	/*
	 * A successful return from AdjustTokenPrivileges() is not sufficient:
	 * the last-error value must also be checked.  ERROR_NOT_ALL_ASSIGNED is
	 * reported with a hint about granting the user right; any other non-zero
	 * value is reported as a plain system call failure.
	 */
	if (GetLastError() != ERROR_SUCCESS)
	{
		if (GetLastError() == ERROR_NOT_ALL_ASSIGNED)
			ereport(elevel,
					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
					 errmsg("could not enable user right \"%s\"", _("Lock pages in memory")),
					 errhint("Assign user right \"%s\" to the Windows user account which runs PostgreSQL.",
							 _("Lock pages in memory"))));
		else
			ereport(elevel,
					(errmsg("could not enable user right \"%s\": error code %lu", _("Lock pages in memory"), GetLastError()),
					 errdetail("Failed system call was %s.", "AdjustTokenPrivileges")));
		CloseHandle(hToken);
		return FALSE;
	}

	CloseHandle(hToken);

	return TRUE;
}

/*
 * PGSharedMemoryCreate
 *
 * Create a shared memory segment of the given size and initialize its
 * standard header.
 *
 * size is the requested segment size; it may be rounded up to a multiple of
 * the large-page size when huge pages are used.  The address of the
 * initialized header is stored in *shim and also returned.  All failures are
 * reported at FATAL level.
 *
 * Side effects: sets ShmemProtectiveRegion, UsedShmemSegAddr,
 * UsedShmemSegSize and UsedShmemSegID, and registers
 * pgwin32_SharedMemoryDelete as an on_shmem_exit callback.
 */
PGShmemHeader *
PGSharedMemoryCreate(Size size,
					 PGShmemHeader **shim)
{
	void	   *memAddress;
	PGShmemHeader *hdr;
	HANDLE		hmap,
				hmap2;
	char	   *szShareMem;
	int			i;
	DWORD		size_high;
	DWORD		size_low;
	SIZE_T		largePageSize = 0;
	Size		orig_size = size;	/* kept for the non-huge-page fallback */
	DWORD		flProtect = PAGE_READWRITE;
	DWORD		desiredAccess;

	/*
	 * Reserve the protective region first, before the segment itself is
	 * mapped (see the comment above PROTECTIVE_REGION_SIZE).
	 */
	ShmemProtectiveRegion = VirtualAlloc(NULL, PROTECTIVE_REGION_SIZE,
										 MEM_RESERVE, PAGE_NOACCESS);
	if (ShmemProtectiveRegion == NULL)
		elog(FATAL, "could not reserve memory region: error code %lu",
			 GetLastError());

	/* Room for a header? */
	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));

	szShareMem = GetSharedMemName();

	UsedShmemSegAddr = NULL;

	if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
	{
		/* Does the processor support large pages? */
		largePageSize = GetLargePageMinimum();
		if (largePageSize == 0)
		{
			/* With huge_pages=on this is reported at FATAL, else at DEBUG1 */
			ereport(huge_pages == HUGE_PAGES_ON ? FATAL : DEBUG1,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("the processor does not support large pages")));
			ereport(DEBUG1,
					(errmsg_internal("disabling huge pages")));
		}
		else if (!EnableLockPagesPrivilege(huge_pages == HUGE_PAGES_ON ? FATAL : DEBUG1))
		{
			ereport(DEBUG1,
					(errmsg_internal("disabling huge pages")));
		}
		else
		{
			/* Huge pages available and privilege enabled, so turn on */
			flProtect = PAGE_READWRITE | SEC_COMMIT | SEC_LARGE_PAGES;

			/* Round size up as appropriate. */
			if (size % largePageSize != 0)
				size += largePageSize - (size % largePageSize);
		}
	}

	/*
	 * Re-entered via "goto retry" below when a large-page mapping could not
	 * be created; size and flProtect have been reset by then, and the retry
	 * counter of the loop starts over.
	 */
retry:
	/* CreateFileMapping() takes the size as two 32-bit halves */
#ifdef _WIN64
	size_high = size >> 32;
#else
	size_high = 0;
#endif
	size_low = (DWORD) size;

	/*
	 * When recycling a shared memory segment, it may take a short while
	 * before it gets dropped from the global namespace. So re-try after
	 * sleeping for a second, and continue retrying 10 times. (both the 1
	 * second time and the 10 retries are completely arbitrary)
	 */
	for (i = 0; i < 10; i++)
	{
		/*
		 * In case CreateFileMapping() doesn't set the error code to 0 on
		 * success
		 */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 flProtect,
								 size_high, /* Size Upper 32 Bits	*/
								 size_low,	/* Size Lower 32 bits */
								 szShareMem);

		if (!hmap)
		{
			/*
			 * Only huge_pages=try with a large-page request falls back to
			 * ordinary pages; every other failure is fatal.
			 */
			if (GetLastError() == ERROR_NO_SYSTEM_RESOURCES &&
				huge_pages == HUGE_PAGES_TRY &&
				(flProtect & SEC_LARGE_PAGES) != 0)
			{
				elog(DEBUG1, "CreateFileMapping(%zu) with SEC_LARGE_PAGES failed, "
					 "huge pages disabled",
					 size);

				/*
				 * Use the original size, not the rounded-up value, when
				 * falling back to non-huge pages.
				 */
				size = orig_size;
				flProtect = PAGE_READWRITE;
				goto retry;
			}
			else
				ereport(FATAL,
						(errmsg("could not create shared memory segment: error code %lu", GetLastError()),
						 errdetail("Failed system call was CreateFileMapping(size=%zu, name=%s).",
								   size, szShareMem)));
		}

		/*
		 * If the segment already existed, CreateFileMapping() will return a
		 * handle to the existing one and set ERROR_ALREADY_EXISTS.
		 */
		if (GetLastError() == ERROR_ALREADY_EXISTS)
		{
			CloseHandle(hmap);	/* Close the handle, since we got a valid one
								 * to the previous segment. */
			hmap = NULL;
			Sleep(1000);
			continue;
		}
		break;
	}

	/*
	 * If the last call in the loop still returned ERROR_ALREADY_EXISTS, this
	 * shared memory segment exists and we assume it belongs to somebody else.
	 */
	if (!hmap)
		ereport(FATAL,
				(errmsg("pre-existing shared memory block is still in use"),
				 errhint("Check if there are any old server processes still running, and terminate them.")));

	/* The name is not needed once the mapping object exists */
	free(szShareMem);

	/*
	 * Make the handle inheritable
	 */
	if (!DuplicateHandle(GetCurrentProcess(), hmap, GetCurrentProcess(), &hmap2, 0, TRUE, DUPLICATE_SAME_ACCESS))
		ereport(FATAL,
				(errmsg("could not create shared memory segment: error code %lu", GetLastError()),
				 errdetail("Failed system call was DuplicateHandle.")));

	/*
	 * Close the old, non-inheritable handle.  If this fails we don't really
	 * care.
	 */
	if (!CloseHandle(hmap))
		elog(LOG, "could not close handle to shared memory: error code %lu", GetLastError());

	desiredAccess = FILE_MAP_WRITE | FILE_MAP_READ;

#ifdef FILE_MAP_LARGE_PAGES
	/* Set large pages if wanted. */
	if ((flProtect & SEC_LARGE_PAGES) != 0)
		desiredAccess |= FILE_MAP_LARGE_PAGES;
#endif

	/*
	 * Get a pointer to the new shared memory segment. Map the whole segment
	 * at once, and let the system decide on the initial address.
	 */
	memAddress = MapViewOfFileEx(hmap2, desiredAccess, 0, 0, 0, NULL);
	if (!memAddress)
		ereport(FATAL,
				(errmsg("could not create shared memory segment: error code %lu", GetLastError()),
				 errdetail("Failed system call was MapViewOfFileEx.")));

	/*
	 * OK, we created a new segment.  Mark it as created by this process. The
	 * order of assignments here is critical so that another Postgres process
	 * can't see the header as valid but belonging to an invalid PID!
	 */
	hdr = (PGShmemHeader *) memAddress;
	hdr->creatorPID = getpid();
	hdr->magic = PGShmemMagic;

	/*
	 * Initialize space allocation status for segment.
	 */
	hdr->totalsize = size;
	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
	hdr->dsm_control = 0;

	/* Save info for possible future use */
	UsedShmemSegAddr = memAddress;
	UsedShmemSegSize = size;
	UsedShmemSegID = hmap2;

	/* Register on-exit routine to delete the new segment */
	on_shmem_exit(pgwin32_SharedMemoryDelete, PointerGetDatum(hmap2));

	*shim = hdr;
	return hdr;
}

/*
 * PGSharedMemoryReAttach
 *
 * This is called during startup of a postmaster child process to re-attach to
 * an already existing shared memory segment, using the handle inherited from
 * the postmaster.
 *
 * ShmemProtectiveRegion, UsedShmemSegID and UsedShmemSegAddr are implicit
 * parameters to this routine.  The caller must have already restored them to
 * the postmaster's values.
 *
 * Every failure, including the view landing at a different address or not
 * carrying the expected magic number, is reported at FATAL level.
 */
void
PGSharedMemoryReAttach(void)
{
	PGShmemHeader *hdr;
	void	   *origUsedShmemSegAddr = UsedShmemSegAddr;

	Assert(ShmemProtectiveRegion != NULL);
	Assert(UsedShmemSegAddr != NULL);
	Assert(IsUnderPostmaster);

	/*
	 * Release memory region reservations made by the postmaster
	 *
	 * The protective region is released first and the segment's own region
	 * second, immediately before mapping; see the comment above
	 * PROTECTIVE_REGION_SIZE for why.
	 */
	if (VirtualFree(ShmemProtectiveRegion, 0, MEM_RELEASE) == 0)
		elog(FATAL, "failed to release reserved memory region (addr=%p): error code %lu",
			 ShmemProtectiveRegion, GetLastError());
	if (VirtualFree(UsedShmemSegAddr, 0, MEM_RELEASE) == 0)
		elog(FATAL, "failed to release reserved memory region (addr=%p): error code %lu",
			 UsedShmemSegAddr, GetLastError());

	/* Map the whole segment at the address just released */
	hdr = (PGShmemHeader *) MapViewOfFileEx(UsedShmemSegID, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, 0, UsedShmemSegAddr);
	if (!hdr)
		elog(FATAL, "could not reattach to shared memory (key=%p, addr=%p): error code %lu",
			 UsedShmemSegID, UsedShmemSegAddr, GetLastError());
	if (hdr != origUsedShmemSegAddr)
		elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
			 hdr, origUsedShmemSegAddr);
	if (hdr->magic != PGShmemMagic)
		elog(FATAL, "reattaching to shared memory returned non-PostgreSQL memory");

	/* Hand the control handle stored in the header to the dsm module */
	dsm_set_control_handle(hdr->dsm_control);

	UsedShmemSegAddr = hdr;		/* probably redundant */
}
/* * PGSharedMemoryNoReAttach * * This is called during startup of a postmaster child process when we choose * *not* to re-attach to the existing shared memory segment. We must clean up * to leave things in the appropriate state. * * The child process startup logic might or might not call PGSharedMemoryDetach * after this; make sure that it will be a no-op if called. * * ShmemProtectiveRegion, UsedShmemSegID and UsedShmemSegAddr are implicit * parameters to this routine. The caller must have already restored them to * the postmaster's values. */ void PGSharedMemoryNoReAttach(void) { Assert(ShmemProtectiveRegion != NULL); Assert(UsedShmemSegAddr != NULL); Assert(IsUnderPostmaster); /* * Under Windows we will not have mapped the segment, so we don't need to * un-map it. Just reset UsedShmemSegAddr to show we're not attached. */ UsedShmemSegAddr = NULL; /* * We *must* close the inherited shmem segment handle, else Windows will * consider the existence of this process to mean it can't release the * shmem segment yet. We can now use PGSharedMemoryDetach to do that. */ PGSharedMemoryDetach(); } /* * PGSharedMemoryDetach * * Detach from the shared memory segment, if still attached. This is not * intended to be called explicitly by the process that originally created the * segment (it will have an on_shmem_exit callback registered to do that). * Rather, this is for subprocesses that have inherited an attachment and want * to get rid of it. * * ShmemProtectiveRegion, UsedShmemSegID and UsedShmemSegAddr are implicit * parameters to this routine. */ void PGSharedMemoryDetach(void) { /* * Releasing the protective region liberates an unimportant quantity of * address space, but be tidy. 
*/ if (ShmemProtectiveRegion != NULL) { if (VirtualFree(ShmemProtectiveRegion, 0, MEM_RELEASE) == 0) elog(LOG, "failed to release reserved memory region (addr=%p): error code %lu", ShmemProtectiveRegion, GetLastError()); ShmemProtectiveRegion = NULL; } /* Unmap the view, if it's mapped */ if (UsedShmemSegAddr != NULL) { if (!UnmapViewOfFile(UsedShmemSegAddr)) elog(LOG, "could not unmap view of shared memory: error code %lu", GetLastError()); UsedShmemSegAddr = NULL; } /* And close the shmem handle, if we have one */ if (UsedShmemSegID != INVALID_HANDLE_VALUE) { if (!CloseHandle(UsedShmemSegID)) elog(LOG, "could not close handle to shared memory: error code %lu", GetLastError()); UsedShmemSegID = INVALID_HANDLE_VALUE; } } /* * pgwin32_SharedMemoryDelete * * Detach from and delete the shared memory segment * (called as an on_shmem_exit callback, hence funny argument list) */ static void pgwin32_SharedMemoryDelete(int status, Datum shmId) { Assert(DatumGetPointer(shmId) == UsedShmemSegID); PGSharedMemoryDetach(); } /* * pgwin32_ReserveSharedMemoryRegion(hChild) * * Reserve the memory region that will be used for shared memory in a child * process. It is called before the child process starts, to make sure the * memory is available. * * Once the child starts, DLLs loading in different order or threads getting * scheduled differently may allocate memory which can conflict with the * address space we need for our shared memory. By reserving the shared * memory region before the child starts, and freeing it only just before we * attempt to get access to the shared memory forces these allocations to * be given different address ranges that don't conflict. * * NOTE! This function executes in the postmaster, and should for this * reason not use elog(FATAL) since that would take down the postmaster. 
*/ int pgwin32_ReserveSharedMemoryRegion(HANDLE hChild) { void *address; Assert(ShmemProtectiveRegion != NULL); Assert(UsedShmemSegAddr != NULL); Assert(UsedShmemSegSize != 0); /* ShmemProtectiveRegion */ address = VirtualAllocEx(hChild, ShmemProtectiveRegion, PROTECTIVE_REGION_SIZE, MEM_RESERVE, PAGE_NOACCESS); if (address == NULL) { /* Don't use FATAL since we're running in the postmaster */ elog(LOG, "could not reserve shared memory region (addr=%p) for child %p: error code %lu", ShmemProtectiveRegion, hChild, GetLastError()); return false; } if (address != ShmemProtectiveRegion) { /* * Should never happen - in theory if allocation granularity causes * strange effects it could, so check just in case. * * Don't use FATAL since we're running in the postmaster. */ elog(LOG, "reserved shared memory region got incorrect address %p, expected %p", address, ShmemProtectiveRegion); return false; } /* UsedShmemSegAddr */ address = VirtualAllocEx(hChild, UsedShmemSegAddr, UsedShmemSegSize, MEM_RESERVE, PAGE_READWRITE); if (address == NULL) { elog(LOG, "could not reserve shared memory region (addr=%p) for child %p: error code %lu", UsedShmemSegAddr, hChild, GetLastError()); return false; } if (address != UsedShmemSegAddr) { elog(LOG, "reserved shared memory region got incorrect address %p, expected %p", address, UsedShmemSegAddr); return false; } return true; } /* * This function is provided for consistency with sysv_shmem.c and does not * provide any useful information for Windows. To obtain the large page size, * use GetLargePageMinimum() instead. */ void GetHugePageSize(Size *hugepagesize, int *mmap_flags) { if (hugepagesize) *hugepagesize = 0; if (mmap_flags) *mmap_flags = 0; } /* * GUC check_hook for huge_page_size */ bool check_huge_page_size(int *newval, void **extra, GucSource source) { if (*newval != 0) { GUC_check_errdetail("huge_page_size must be 0 on this platform."); return false; } return true; }