diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/rocksdb/port/win | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/port/win')
-rw-r--r-- | src/rocksdb/port/win/env_default.cc | 45 | ||||
-rw-r--r-- | src/rocksdb/port/win/env_win.cc | 1437 | ||||
-rw-r--r-- | src/rocksdb/port/win/env_win.h | 304 | ||||
-rw-r--r-- | src/rocksdb/port/win/io_win.cc | 1101 | ||||
-rw-r--r-- | src/rocksdb/port/win/io_win.h | 508 | ||||
-rw-r--r-- | src/rocksdb/port/win/port_win.cc | 303 | ||||
-rw-r--r-- | src/rocksdb/port/win/port_win.h | 378 | ||||
-rw-r--r-- | src/rocksdb/port/win/win_jemalloc.cc | 80 | ||||
-rw-r--r-- | src/rocksdb/port/win/win_logger.cc | 192 | ||||
-rw-r--r-- | src/rocksdb/port/win/win_logger.h | 64 | ||||
-rw-r--r-- | src/rocksdb/port/win/win_thread.cc | 170 | ||||
-rw-r--r-- | src/rocksdb/port/win/win_thread.h | 117 | ||||
-rw-r--r-- | src/rocksdb/port/win/xpress_win.cc | 210 | ||||
-rw-r--r-- | src/rocksdb/port/win/xpress_win.h | 26 |
14 files changed, 4935 insertions, 0 deletions
diff --git a/src/rocksdb/port/win/env_default.cc b/src/rocksdb/port/win/env_default.cc new file mode 100644 index 000000000..48853f26e --- /dev/null +++ b/src/rocksdb/port/win/env_default.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(OS_WIN) + +#include <mutex> + +#include "port/win/env_win.h" +#include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "util/compression_context_cache.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { + +// We choose not to destroy the env because joining the threads from the +// system loader +// which destroys the statics (same as from DLLMain) creates a system loader +// dead-lock. +// in this manner any remaining threads are terminated OK. +namespace { +std::once_flag winenv_once_flag; +Env* envptr; +}; // namespace +} // namespace port + +Env* Env::Default() { + ThreadLocalPtr::InitSingletons(); + CompressionContextCache::InitSingleton(); + INIT_SYNC_POINT_SINGLETONS(); + std::call_once(port::winenv_once_flag, + []() { port::envptr = new port::WinEnv(); }); + return port::envptr; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/port/win/env_win.cc b/src/rocksdb/port/win/env_win.cc new file mode 100644 index 000000000..2262eb59c --- /dev/null +++ b/src/rocksdb/port/win/env_win.cc @@ -0,0 +1,1437 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(OS_WIN) + +#include "port/win/env_win.h" + +#include <direct.h> // _rmdir, _mkdir, _getcwd +#include <errno.h> +#include <io.h> // _access +#include <rpc.h> // for uuid generation +#include <shlwapi.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <windows.h> +#include <winioctl.h> + +#include <algorithm> +#include <ctime> +#include <thread> + +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_updater.h" +#include "monitoring/thread_status_util.h" +#include "port/lang.h" +#include "port/port.h" +#include "port/port_dirent.h" +#include "port/win/io_win.h" +#include "port/win/win_logger.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "strsafe.h" +#include "util/string_util.h" + +// Undefine the functions windows might use (again)... +#undef GetCurrentTime +#undef DeleteFile +#undef LoadLibrary + +namespace ROCKSDB_NAMESPACE { + +ThreadStatusUpdater* CreateThreadStatusUpdater() { + return new ThreadStatusUpdater(); +} + +namespace { + +// Sector size used when physical sector size cannot be obtained from device. +static const size_t kSectorSize = 512; + +// RAII helpers for HANDLEs +const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); }; +using UniqueCloseHandlePtr = std::unique_ptr<void, decltype(CloseHandleFunc)>; + +const auto FindCloseFunc = [](HANDLE h) { ::FindClose(h); }; +using UniqueFindClosePtr = std::unique_ptr<void, decltype(FindCloseFunc)>; + +void WinthreadCall(const char* label, std::error_code result) { + if (0 != result.value()) { + fprintf(stderr, "Winthread %s: %s\n", label, + errnoStr(result.value()).c_str()); + abort(); + } +} + +} // namespace + +namespace port { +WinClock::WinClock() + : perf_counter_frequency_(0), + nano_seconds_per_period_(0), + GetSystemTimePreciseAsFileTime_(NULL) { + { + LARGE_INTEGER qpf; + BOOL ret __attribute__((__unused__)); + ret = QueryPerformanceFrequency(&qpf); + assert(ret == TRUE); + perf_counter_frequency_ = qpf.QuadPart; + + if (std::nano::den % perf_counter_frequency_ == 0) { + nano_seconds_per_period_ = std::nano::den / perf_counter_frequency_; + } + } + + HMODULE module = GetModuleHandle("kernel32.dll"); + if (module != NULL) { + GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)( + void*)GetProcAddress(module, "GetSystemTimePreciseAsFileTime"); + } +} + +void WinClock::SleepForMicroseconds(int micros) { + std::this_thread::sleep_for(std::chrono::microseconds(micros)); +} + +std::string WinClock::TimeToString(uint64_t secondsSince1970) { + std::string result; + + const time_t seconds = secondsSince1970; + const int maxsize = 64; + + struct tm t; + errno_t ret = localtime_s(&t, &seconds); + + if (ret) { + result = std::to_string(seconds); + } else { + result.resize(maxsize); + char* p = &result[0]; + + int len = + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + assert(len > 0); + + result.resize(len); + } + + return result; +} + +uint64_t WinClock::NowMicros() { + if (GetSystemTimePreciseAsFileTime_ != NULL) { + // all std::chrono clocks on windows proved to return + // values that may repeat that is not good enough for some uses. + const int64_t c_UnixEpochStartTicks = 116444736000000000LL; + const int64_t c_FtToMicroSec = 10; + + // This interface needs to return system time and not + // just any microseconds because it is often used as an argument + // to TimedWait() on condition variable + FILETIME ftSystemTime; + GetSystemTimePreciseAsFileTime_(&ftSystemTime); + + LARGE_INTEGER li; + li.LowPart = ftSystemTime.dwLowDateTime; + li.HighPart = ftSystemTime.dwHighDateTime; + // Subtract unix epoch start + li.QuadPart -= c_UnixEpochStartTicks; + // Convert to microsecs + li.QuadPart /= c_FtToMicroSec; + return li.QuadPart; + } + return std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::system_clock::now().time_since_epoch()) + .count(); +} + +uint64_t WinClock::NowNanos() { + if (nano_seconds_per_period_ != 0) { + // all std::chrono clocks on windows have the same resolution that is only + // good enough for microseconds but not nanoseconds + // On Windows 8 and Windows 2012 Server + // GetSystemTimePreciseAsFileTime(¤t_time) can be used + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + // Convert performance counter to nanoseconds by precomputed ratio. + // Directly multiply nano::den with li.QuadPart causes overflow. + // Only do this when nano::den is divisible by perf_counter_frequency_, + // which most likely is the case in reality. If it's not, fall back to + // high_resolution_clock, which may be less precise under old compilers. + li.QuadPart *= nano_seconds_per_period_; + return li.QuadPart; + } + return std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); +} + +Status WinClock::GetCurrentTime(int64_t* unix_time) { + time_t time = std::time(nullptr); + if (time == (time_t)(-1)) { + return Status::NotSupported("Failed to get time"); + } + + *unix_time = time; + return Status::OK(); +} + +WinFileSystem::WinFileSystem(const std::shared_ptr<SystemClock>& clock) + : clock_(clock), page_size_(4 * 1024), allocation_granularity_(page_size_) { + SYSTEM_INFO sinfo; + GetSystemInfo(&sinfo); + + page_size_ = sinfo.dwPageSize; + allocation_granularity_ = sinfo.dwAllocationGranularity; +} + +const std::shared_ptr<WinFileSystem>& WinFileSystem::Default() { + STATIC_AVOID_DESTRUCTION(std::shared_ptr<WinFileSystem>, fs) + (std::make_shared<WinFileSystem>(WinClock::Default())); + return fs; +} + +WinEnvIO::WinEnvIO(Env* hosted_env) : hosted_env_(hosted_env) {} + +WinEnvIO::~WinEnvIO() {} + +IOStatus WinFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; + + BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to delete: " + fname, lastError); + } + + return result; +} + +IOStatus WinFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); + if (result != 0) { + s = IOError("Failed to truncate: " + fname, errno); + } + return s; +} + +IOStatus WinFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr<FSSequentialFile>* result, IODebugContext* /*dbg*/) { + IOStatus s; + + result->reset(); + + // Corruption test needs to rename and delete files of these kind + // while they are still open with another handle. For that reason we + // allow share_write and delete(allows rename). + HANDLE hFile = INVALID_HANDLE_VALUE; + + DWORD fileFlags = FILE_ATTRIBUTE_READONLY; + + if (options.use_direct_reads && !options.use_mmap_reads) { + fileFlags |= FILE_FLAG_NO_BUFFERING; + } + + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, + OPEN_EXISTING, // Original fopen mode is "rb" + fileFlags, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname, + lastError); + } else { + result->reset(new WinSequentialFile(fname, hFile, options)); + } + return s; +} + +IOStatus WinFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) { + result->reset(); + IOStatus s; + + // Open the file for read-only random access + // Random access is to disable read-ahead as the system reads too much data + DWORD fileFlags = FILE_ATTRIBUTE_READONLY; + + if (options.use_direct_reads && !options.use_mmap_reads) { + fileFlags |= FILE_FLAG_NO_BUFFERING; + } else { + fileFlags |= FILE_FLAG_RANDOM_ACCESS; + } + + /// Shared access is necessary for corruption test to pass + // almost all tests would work with a possible exception of fault_injection + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = + RX_CreateFile(RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, fileFlags, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "NewRandomAccessFile failed to Create/Open: " + fname, lastError); + } + + UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); + + // CAUTION! This will map the entire file into the process address space. + // Not recommended for 32-bit platforms. + if (options.use_mmap_reads) { + uint64_t fileSize; + + s = GetFileSize(fname, IOOptions(), &fileSize, dbg); + + if (s.ok()) { + // Will not map empty files + if (fileSize == 0) { + return IOError("NewRandomAccessFile failed to map empty file: " + fname, + EINVAL); + } + + HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY, + 0, // At its present length + 0, + NULL); // Mapping name + + if (!hMap) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create file mapping for NewRandomAccessFile: " + fname, + lastError); + } + + UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); + + const void* mapped_region = + MapViewOfFileEx(hMap, FILE_MAP_READ, + 0, // High DWORD of access start + 0, // Low DWORD + static_cast<SIZE_T>(fileSize), + NULL); // Let the OS choose the mapping + + if (!mapped_region) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to MapViewOfFile for NewRandomAccessFile: " + fname, + lastError); + } + + result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region, + static_cast<size_t>(fileSize))); + + mapGuard.release(); + fileGuard.release(); + } + } else { + result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options)); + fileGuard.release(); + } + return s; +} + +IOStatus WinFileSystem::OpenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, bool reopen) { + const size_t c_BufferCapacity = 64 * 1024; + + EnvOptions local_options(options); + + result->reset(); + IOStatus s; + + DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; + + if (local_options.use_direct_writes && !local_options.use_mmap_writes) { + fileFlags = FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; + } + + // Desired access. We are want to write only here but if we want to memory + // map + // the file then there is no write only mode so we have to create it + // Read/Write + // However, MapViewOfFile specifies only Write only + DWORD desired_access = GENERIC_WRITE; + DWORD shared_mode = FILE_SHARE_READ; + + if (local_options.use_mmap_writes) { + desired_access |= GENERIC_READ; + } else { + // Adding this solely for tests to pass (fault_injection_test, + // wal_manager_test). + shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE); + } + + // This will always truncate the file + DWORD creation_disposition = CREATE_ALWAYS; + if (reopen) { + creation_disposition = OPEN_ALWAYS; + } + + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), + desired_access, // Access desired + shared_mode, + NULL, // Security attributes + // Posix env says (reopen) ? (O_CREATE | O_APPEND) : O_CREAT | O_TRUNC + creation_disposition, + fileFlags, // Flags + NULL); // Template File + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create a NewWritableFile: " + fname, lastError); + } + + // We will start writing at the end, appending + if (reopen) { + LARGE_INTEGER zero_move; + zero_move.QuadPart = 0; + BOOL ret = SetFilePointerEx(hFile, zero_move, NULL, FILE_END); + if (!ret) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create a ReopenWritableFile move to the end: " + fname, + lastError); + } + } + + if (options.use_mmap_writes) { + // We usually do not use mmmapping on SSD and thus we pass memory + // page_size + result->reset(new WinMmapFile(fname, hFile, page_size_, + allocation_granularity_, local_options)); + } else { + // Here we want the buffer allocation to be aligned by the SSD page size + // and to be a multiple of it + result->reset(new WinWritableFile(fname, hFile, GetPageSize(), + c_BufferCapacity, local_options)); + } + return s; +} + +IOStatus WinFileSystem::NewWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, false); +} + +IOStatus WinFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, true); +} + +IOStatus WinFileSystem::NewRandomRWFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSRandomRWFile>* result, + IODebugContext* /*dbg*/) { + IOStatus s; + + // Open the file for read-only random access + // Random access is to disable read-ahead as the system reads too much data + DWORD desired_access = GENERIC_READ | GENERIC_WRITE; + DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; + DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist + DWORD file_flags = FILE_FLAG_RANDOM_ACCESS; + + if (options.use_direct_reads && options.use_direct_writes) { + file_flags |= FILE_FLAG_NO_BUFFERING; + } + + /// Shared access is necessary for corruption test to pass + // almost all tests would work with a possible exception of fault_injection + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile(RX_FN(fname).c_str(), desired_access, shared_mode, + NULL, // Security attributes + creation_disposition, file_flags, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "NewRandomRWFile failed to Create/Open: " + fname, lastError); + } + + UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); + result->reset(new WinRandomRWFile(fname, hFile, GetPageSize(), options)); + fileGuard.release(); + + return s; +} + +IOStatus WinFileSystem::NewMemoryMappedFileBuffer( + const std::string& fname, std::unique_ptr<MemoryMappedFileBuffer>* result) { + IOStatus s; + result->reset(); + + DWORD fileFlags = FILE_ATTRIBUTE_READONLY; + + HANDLE hFile = INVALID_HANDLE_VALUE; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, + OPEN_EXISTING, // Open only if it exists + fileFlags, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to open NewMemoryMappedFileBuffer: " + fname, lastError); + return s; + } + UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); + + uint64_t fileSize = 0; + s = GetFileSize(fname, IOOptions(), &fileSize, nullptr); + if (!s.ok()) { + return s; + } + // Will not map empty files + if (fileSize == 0) { + return IOStatus::NotSupported( + "NewMemoryMappedFileBuffer can not map zero length files: " + fname); + } + + // size_t is 32-bit with 32-bit builds + if (fileSize > std::numeric_limits<size_t>::max()) { + return IOStatus::NotSupported( + "The specified file size does not fit into 32-bit memory addressing: " + + fname); + } + + HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE, + 0, // Whole file at its present length + 0, + NULL); // Mapping name + + if (!hMap) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create file mapping for: " + fname, lastError); + } + UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); + + void* base = MapViewOfFileEx(hMap, FILE_MAP_WRITE, + 0, // High DWORD of access start + 0, // Low DWORD + static_cast<SIZE_T>(fileSize), + NULL); // Let the OS choose the mapping + + if (!base) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to MapViewOfFile for NewMemoryMappedFileBuffer: " + fname, + lastError); + } + + result->reset(new WinMemoryMappedBuffer(hFile, hMap, base, + static_cast<size_t>(fileSize))); + + mapGuard.release(); + fileGuard.release(); + + return s; +} + +IOStatus WinFileSystem::NewDirectory(const std::string& name, + const IOOptions& /*options*/, + std::unique_ptr<FSDirectory>* result, + IODebugContext* /*dbg*/) { + IOStatus s; + // Must be nullptr on failure + result->reset(); + + if (!DirExists(name)) { + s = IOErrorFromWindowsError("open folder: " + name, ERROR_DIRECTORY); + return s; + } + + HANDLE handle = INVALID_HANDLE_VALUE; + // 0 - for access means read metadata + { + IOSTATS_TIMER_GUARD(open_nanos); + handle = RX_CreateFile( + RX_FN(name).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + } + + if (INVALID_HANDLE_VALUE == handle) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("open folder: " + name, lastError); + return s; + } + + result->reset(new WinDirectory(name, handle)); + + return s; +} + +IOStatus WinFileSystem::FileExists(const std::string& fname, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; + // TODO: This does not follow symbolic links at this point + // which is consistent with _access() impl on windows + // but can be added + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (FALSE == RX_GetFileAttributesEx(RX_FN(fname).c_str(), + GetFileExInfoStandard, &attrs)) { + auto lastError = GetLastError(); + switch (lastError) { + case ERROR_ACCESS_DENIED: + case ERROR_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + s = IOStatus::NotFound(); + break; + default: + s = IOErrorFromWindowsError("Unexpected error for: " + fname, + lastError); + break; + } + } + return s; +} + +IOStatus WinFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*opts*/, + std::vector<std::string>* result, + IODebugContext* /*dbg*/) { + IOStatus status; + result->clear(); + + RX_WIN32_FIND_DATA data; + memset(&data, 0, sizeof(data)); + std::string pattern(dir); + pattern.append("\\").append("*"); + + HANDLE handle = + RX_FindFirstFileEx(RX_FN(pattern).c_str(), + // Do not want alternative name + FindExInfoBasic, &data, FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); + + if (handle == INVALID_HANDLE_VALUE) { + auto lastError = GetLastError(); + switch (lastError) { + case ERROR_NOT_FOUND: + case ERROR_ACCESS_DENIED: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + status = IOStatus::NotFound(); + break; + default: + status = IOErrorFromWindowsError("Failed to GetChhildren for: " + dir, + lastError); + } + return status; + } + + UniqueFindClosePtr fc(handle, FindCloseFunc); + + // For safety + data.cFileName[MAX_PATH - 1] = 0; + + while (true) { + // filter out '.' and '..' directory entries + // which appear only on some platforms + const bool ignore = + ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) && + (RX_FNCMP(data.cFileName, ".") == 0 || + RX_FNCMP(data.cFileName, "..") == 0); + if (!ignore) { + auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); + result->push_back(FN_TO_RX(x)); + } + + BOOL ret = -RX_FindNextFile(handle, &data); + // If the function fails the return value is zero + // and non-zero otherwise. Not TRUE or FALSE. + if (ret == FALSE) { + // Posix does not care why we stopped + break; + } + data.cFileName[MAX_PATH - 1] = 0; + } + return status; +} + +IOStatus WinFileSystem::CreateDir(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; + BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); + } + + return result; +} + +IOStatus WinFileSystem::CreateDirIfMissing(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; + + if (DirExists(name)) { + return result; + } + + BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); + if (!ret) { + auto lastError = GetLastError(); + if (lastError != ERROR_ALREADY_EXISTS) { + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); + } else { + result = IOStatus::IOError(name + ": exists but is not a directory"); + } + } + return result; +} + +IOStatus WinFileSystem::DeleteDir(const std::string& name, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; + BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str()); + if (!ret) { + auto lastError = GetLastError(); + result = + IOErrorFromWindowsError("Failed to remove dir: " + name, lastError); + } + return result; +} + +IOStatus WinFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* size, + IODebugContext* /*dbg*/) { + IOStatus s; + + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, + &attrs)) { + ULARGE_INTEGER file_size; + file_size.HighPart = attrs.nFileSizeHigh; + file_size.LowPart = attrs.nFileSizeLow; + *size = file_size.QuadPart; + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Can not get size for: " + fname, lastError); + } + return s; +} + +uint64_t WinFileSystem::FileTimeToUnixTime(const FILETIME& ftTime) { + const uint64_t c_FileTimePerSecond = 10000000U; + // UNIX epoch starts on 1970-01-01T00:00:00Z + // Windows FILETIME starts on 1601-01-01T00:00:00Z + // Therefore, we need to subtract the below number of seconds from + // the seconds that we obtain from FILETIME with an obvious loss of + // precision + const uint64_t c_SecondBeforeUnixEpoch = 11644473600U; + + ULARGE_INTEGER li; + li.HighPart = ftTime.dwHighDateTime; + li.LowPart = ftTime.dwLowDateTime; + + uint64_t result = + (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; + return result; +} + +IOStatus WinFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*opts*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) { + IOStatus s; + + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, + &attrs)) { + *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Can not get file modification time for: " + fname, lastError); + *file_mtime = 0; + } + + return s; +} + +IOStatus WinFileSystem::RenameFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; + + // rename() is not capable of replacing the existing file as on Linux + // so use OS API directly + if (!RX_MoveFileEx(RX_FN(src).c_str(), RX_FN(target).c_str(), + MOVEFILE_REPLACE_EXISTING)) { + DWORD lastError = GetLastError(); + + std::string text("Failed to rename: "); + text.append(src).append(" to: ").append(target); + + result = IOErrorFromWindowsError(text, lastError); + } + + return result; +} + +IOStatus WinFileSystem::LinkFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; + + if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { + DWORD lastError = GetLastError(); + if (lastError == ERROR_NOT_SAME_DEVICE) { + return IOStatus::NotSupported("No cross FS links allowed"); + } + + std::string text("Failed to link: "); + text.append(src).append(" to: ").append(target); + + result = IOErrorFromWindowsError(text, lastError); + } + + return result; +} + +IOStatus WinFileSystem::NumFileLinks(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* count, + IODebugContext* /*dbg*/) { + IOStatus s; + HANDLE handle = + RX_CreateFile(RX_FN(fname).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); + + if (INVALID_HANDLE_VALUE == handle) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("NumFileLinks: " + fname, lastError); + return s; + } + UniqueCloseHandlePtr handle_guard(handle, CloseHandleFunc); + FILE_STANDARD_INFO standard_info; + if (0 != GetFileInformationByHandleEx(handle, FileStandardInfo, + &standard_info, + sizeof(standard_info))) { + *count = standard_info.NumberOfLinks; + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("GetFileInformationByHandleEx: " + fname, + lastError); + } + return s; +} + +IOStatus WinFileSystem::AreFilesSame(const std::string& first, + const std::string& second, + const IOOptions& /*opts*/, bool* res, + IODebugContext* /*dbg*/) { +// For MinGW builds +#if (_WIN32_WINNT == _WIN32_WINNT_VISTA) + IOStatus s = IOStatus::NotSupported(); +#else + assert(res != nullptr); + IOStatus s; + if (res == nullptr) { + s = IOStatus::InvalidArgument("res"); + return s; + } + + // 0 - for access means read metadata + HANDLE file_1 = RX_CreateFile( + RX_FN(first).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + + if (INVALID_HANDLE_VALUE == file_1) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("open file: " + first, lastError); + return s; + } + UniqueCloseHandlePtr g_1(file_1, CloseHandleFunc); + + HANDLE file_2 = RX_CreateFile( + RX_FN(second).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + + if (INVALID_HANDLE_VALUE == file_2) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("open file: " + second, lastError); + return s; + } + UniqueCloseHandlePtr g_2(file_2, CloseHandleFunc); + + FILE_ID_INFO FileInfo_1; + BOOL result = GetFileInformationByHandleEx(file_1, FileIdInfo, &FileInfo_1, + sizeof(FileInfo_1)); + + if (!result) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("stat file: " + first, lastError); + return s; + } + + FILE_ID_INFO FileInfo_2; + result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, + sizeof(FileInfo_2)); + + if (!result) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("stat file: " + second, lastError); + return s; + } + + if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) { + *res = + (0 == memcmp(FileInfo_1.FileId.Identifier, FileInfo_2.FileId.Identifier, + sizeof(FileInfo_1.FileId.Identifier))); + } else { + *res = false; + } +#endif + return s; +} + +IOStatus WinFileSystem::LockFile(const std::string& lockFname, + const IOOptions& /*opts*/, FileLock** lock, + IODebugContext* /*dbg*/) { + assert(lock != nullptr); + + *lock = NULL; + IOStatus result; + + // No-sharing, this is a LOCK file + const DWORD ExclusiveAccessON = 0; + + // Obtain exclusive access to the LOCK file + // Previously, instead of NORMAL attr we set DELETE on close and that worked + // well except with fault_injection test that insists on deleting it. + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile(RX_FN(lockFname).c_str(), + (GENERIC_READ | GENERIC_WRITE), ExclusiveAccessON, + NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to create lock file: " + lockFname, + lastError); + } else { + *lock = new WinFileLock(hFile); + } + + return result; +} + +IOStatus WinFileSystem::UnlockFile(FileLock* lock, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; + + assert(lock != nullptr); + + delete lock; + + return result; +} + +IOStatus WinFileSystem::GetTestDirectory(const IOOptions& opts, + std::string* result, + IODebugContext* dbg) { + std::string output; + + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + output = env; + } else { + env = getenv("TMP"); + + if (env && env[0] != '\0') { + output = env; + } else { + output = "c:\\tmp"; + } + } + CreateDir(output, opts, dbg); + + output.append("\\testrocksdb-"); + output.append(std::to_string(GetCurrentProcessId())); + + CreateDir(output, opts, dbg); + + output.swap(*result); + + return IOStatus::OK(); +} + +IOStatus WinFileSystem::NewLogger(const std::string& fname, + const IOOptions& /*opts*/, + std::shared_ptr<Logger>* result, + IODebugContext* /*dbg*/) { + IOStatus s; + + result->reset(); + + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_DELETE, // In RocksDb log files are + // renamed and deleted before + // they are closed. This enables + // doing so. + NULL, + CREATE_ALWAYS, // Original fopen mode is "w" + FILE_ATTRIBUTE_NORMAL, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to open LogFile" + fname, lastError); + } else { + { + // With log files we want to set the true creation time as of now + // because the system + // for some reason caches the attributes of the previous file that just + // been renamed from + // this name so auto_roll_logger_test fails + FILETIME ft; + GetSystemTimeAsFileTime(&ft); + // Set creation, last access and last write time to the same value + SetFileTime(hFile, &ft, &ft, &ft); + } + result->reset(new WinLogger(&WinEnvThreads::gettid, clock_.get(), hFile)); + } + return s; +} + +IOStatus WinFileSystem::IsDirectory(const std::string& path, + const IOOptions& /*opts*/, bool* is_dir, + IODebugContext* /*dbg*/) { + BOOL ret = RX_PathIsDirectory(RX_FN(path).c_str()); + if (is_dir) { + *is_dir = ret ? true : false; + } + return IOStatus::OK(); +} + +Status WinEnvIO::GetHostName(char* name, uint64_t len) { + Status s; + DWORD nSize = static_cast<DWORD>( + std::min<uint64_t>(len, std::numeric_limits<DWORD>::max())); + + if (!::GetComputerNameA(name, &nSize)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("GetHostName", lastError); + } else { + name[nSize] = 0; + } + + return s; +} + +IOStatus WinFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* dbg) { + // Check if we already have an absolute path + // For test compatibility we will consider starting slash as an + // absolute path + if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) || + !RX_PathIsRelative(RX_FN(db_path).c_str())) { + *output_path = db_path; + return IOStatus::OK(); + } + + RX_FILESTRING result; + result.resize(MAX_PATH); + + // Hopefully no changes the current directory while we do this + // however _getcwd also suffers from the same limitation + DWORD len = RX_GetCurrentDirectory(MAX_PATH, &result[0]); + if (len == 0) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError("Failed to get current working directory", + lastError); + } + + result.resize(len); + std::string res = FN_TO_RX(result); + + res.swap(*output_path); + return IOStatus::OK(); +} + +IOStatus WinFileSystem::GetFreeSpace(const std::string& path, + const IOOptions& /*options*/, + uint64_t* diskfree, + IODebugContext* /*dbg*/) { + assert(diskfree != nullptr); + ULARGE_INTEGER freeBytes; + BOOL f = RX_GetDiskFreeSpaceEx(RX_FN(path).c_str(), &freeBytes, NULL, NULL); + if (f) { + *diskfree = freeBytes.QuadPart; + return IOStatus::OK(); + } else { + DWORD lastError = GetLastError(); + return IOErrorFromWindowsError("Failed to get free space: " + path, + lastError); + } +} + +FileOptions WinFileSystem::OptimizeForLogWrite( + const FileOptions& file_options, const DBOptions& db_options) const { + FileOptions optimized(file_options); + // These two the same as default optimizations + optimized.bytes_per_sync = db_options.wal_bytes_per_sync; + optimized.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; + + // This adversely affects %999 on windows + optimized.use_mmap_writes = false; + // Direct writes will produce a huge perf impact on + // Windows. Pre-allocate space for WAL. + optimized.use_direct_writes = false; + return optimized; +} + +FileOptions WinFileSystem::OptimizeForManifestWrite( + const FileOptions& options) const { + FileOptions optimized(options); + optimized.use_mmap_writes = false; + optimized.use_direct_reads = false; + return optimized; +} + +FileOptions WinFileSystem::OptimizeForManifestRead( + const FileOptions& file_options) const { + FileOptions optimized(file_options); + optimized.use_mmap_writes = false; + optimized.use_direct_reads = false; + return optimized; +} + +// Returns true iff the named directory exists and is a directory. +bool WinFileSystem::DirExists(const std::string& dname) { + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), GetFileExInfoStandard, + &attrs)) { + return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); + } + return false; +} + +size_t WinFileSystem::GetSectorSize(const std::string& fname) { + size_t sector_size = kSectorSize; + + // obtain device handle + char devicename[7] = "\\\\.\\"; + int erresult = 0; + if (RX_PathIsRelative(RX_FN(fname).c_str())) { + RX_FILESTRING rx_current_dir; + rx_current_dir.resize(MAX_PATH); + DWORD len = RX_GetCurrentDirectory(MAX_PATH, &rx_current_dir[0]); + if (len == 0) { + return sector_size; + } + rx_current_dir.resize(len); + std::string current_dir = FN_TO_RX(rx_current_dir); + erresult = + strncat_s(devicename, sizeof(devicename), current_dir.c_str(), 2); + } else { + erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2); + } + + if (erresult) { + assert(false); + return sector_size; + } + + HANDLE hDevice = CreateFile(devicename, 0, 0, nullptr, OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, nullptr); + + if (hDevice == INVALID_HANDLE_VALUE) { + return sector_size; + } + + STORAGE_PROPERTY_QUERY spropertyquery; + spropertyquery.PropertyId = StorageAccessAlignmentProperty; + spropertyquery.QueryType = PropertyStandardQuery; + + BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)]; + DWORD output_bytes = 0; + + BOOL ret = DeviceIoControl( + hDevice, IOCTL_STORAGE_QUERY_PROPERTY, &spropertyquery, + sizeof(spropertyquery), output_buffer, + sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), &output_bytes, nullptr); + + if (ret) { + sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR*)output_buffer) + ->BytesPerLogicalSector; + } else { + // many devices do not support StorageProcessAlignmentProperty. Any failure + // here and we fall back to logical alignment + + DISK_GEOMETRY_EX geometry = {0}; + ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, nullptr, 0, + &geometry, sizeof(geometry), &output_bytes, nullptr); + if (ret) { + sector_size = geometry.Geometry.BytesPerSector; + } + } + + if (hDevice != INVALID_HANDLE_VALUE) { + CloseHandle(hDevice); + } + + return sector_size; +} + +//////////////////////////////////////////////////////////////////////// +// WinEnvThreads + +WinEnvThreads::WinEnvThreads(Env* hosted_env) + : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) { + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].SetThreadPriority( + static_cast<Env::Priority>(pool_id)); + // This allows later initializing the thread-local-env of each thread. + thread_pools_[pool_id].SetHostEnv(hosted_env); + } +} + +WinEnvThreads::~WinEnvThreads() { + WaitForJoin(); + + for (auto& thpool : thread_pools_) { + thpool.JoinAllThreads(); + } +} + +void WinEnvThreads::Schedule(void (*function)(void*), void* arg, + Env::Priority pri, void* tag, + void (*unschedFunction)(void* arg)) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); +} + +int WinEnvThreads::UnSchedule(void* arg, Env::Priority pri) { + return thread_pools_[pri].UnSchedule(arg); +} + +namespace { + +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; + +void* StartThreadWrapper(void* arg) { + std::unique_ptr<StartThreadState> state( + reinterpret_cast<StartThreadState*>(arg)); + state->user_function(state->arg); + return nullptr; +} + +} // namespace + +void WinEnvThreads::StartThread(void (*function)(void* arg), void* arg) { + std::unique_ptr<StartThreadState> state(new StartThreadState); + state->user_function = function; + state->arg = arg; + try { + Thread th(&StartThreadWrapper, state.get()); + state.release(); + + std::lock_guard<std::mutex> lg(mu_); + threads_to_join_.push_back(std::move(th)); + + } catch (const std::system_error& ex) { + WinthreadCall("start thread", ex.code()); + } +} + +void WinEnvThreads::WaitForJoin() { + for (auto& th : threads_to_join_) { + th.join(); + } + threads_to_join_.clear(); +} + +unsigned int WinEnvThreads::GetThreadPoolQueueLen(Env::Priority pri) const { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + return thread_pools_[pri].GetQueueLen(); +} + +int WinEnvThreads::ReserveThreads(int threads_to_reserved, Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + return thread_pools_[pri].ReserveThreads(threads_to_reserved); +} + +int WinEnvThreads::ReleaseThreads(int threads_to_released, Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + return thread_pools_[pri].ReleaseThreads(threads_to_released); +} + +uint64_t WinEnvThreads::gettid() { + uint64_t thread_id = GetCurrentThreadId(); + return thread_id; +} + +uint64_t WinEnvThreads::GetThreadID() const { return gettid(); } + +void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + thread_pools_[pri].SetBackgroundThreads(num); +} + +int WinEnvThreads::GetBackgroundThreads(Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + return thread_pools_[pri].GetBackgroundThreads(); +} + +void WinEnvThreads::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); +} + +///////////////////////////////////////////////////////////////////////// +// WinEnv + +WinEnv::WinEnv() + : CompositeEnv(WinFileSystem::Default(), WinClock::Default()), + winenv_io_(this), + winenv_threads_(this) { + // Protected member of the base class + thread_status_updater_ = CreateThreadStatusUpdater(); +} + +WinEnv::~WinEnv() { + // All threads must be joined before the deletion of + // thread_status_updater_. + delete thread_status_updater_; +} + +Status WinEnv::GetThreadList(std::vector<ThreadStatus>* thread_list) { + assert(thread_status_updater_); + return thread_status_updater_->GetThreadList(thread_list); +} + +Status WinEnv::GetHostName(char* name, uint64_t len) { + return winenv_io_.GetHostName(name, len); +} + +void WinEnv::Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) { + return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction); +} + +int WinEnv::UnSchedule(void* arg, Env::Priority pri) { + return winenv_threads_.UnSchedule(arg, pri); +} + +void WinEnv::StartThread(void (*function)(void* arg), void* arg) { + return winenv_threads_.StartThread(function, arg); +} + +void WinEnv::WaitForJoin() { return winenv_threads_.WaitForJoin(); } + +unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { + return winenv_threads_.GetThreadPoolQueueLen(pri); +} +int WinEnv::ReserveThreads(int threads_to_reserved, Env::Priority pri) { + return winenv_threads_.ReserveThreads(threads_to_reserved, pri); +} + +int WinEnv::ReleaseThreads(int threads_to_released, Env::Priority pri) { + return winenv_threads_.ReleaseThreads(threads_to_released, pri); +} + +uint64_t WinEnv::GetThreadID() const { return winenv_threads_.GetThreadID(); } + +// Allow increasing the number of worker threads. +void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { + return winenv_threads_.SetBackgroundThreads(num, pri); +} + +int WinEnv::GetBackgroundThreads(Env::Priority pri) { + return winenv_threads_.GetBackgroundThreads(pri); +} + +void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { + return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri); +} + +} // namespace port + +std::shared_ptr<FileSystem> FileSystem::Default() { + return port::WinFileSystem::Default(); +} + +const std::shared_ptr<SystemClock>& SystemClock::Default() { + STATIC_AVOID_DESTRUCTION(std::shared_ptr<SystemClock>, clock) + (std::make_shared<port::WinClock>()); + return clock; +} +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/port/win/env_win.h b/src/rocksdb/port/win/env_win.h new file mode 100644 index 000000000..8fbfb8246 --- /dev/null +++ b/src/rocksdb/port/win/env_win.h @@ -0,0 +1,304 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// An Env is an interface used by the rocksdb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. + +#pragma once +#include <stdint.h> +#include <windows.h> + +#include <mutex> +#include <string> +#include <vector> + +#include "env/composite_env_wrapper.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "util/threadpool_imp.h" + +#undef GetCurrentTime +#undef DeleteFile +#undef LoadLibrary + +namespace ROCKSDB_NAMESPACE { +namespace port { + +// Currently not designed for inheritance but rather a replacement +class WinEnvThreads { + public: + explicit WinEnvThreads(Env* hosted_env); + + ~WinEnvThreads(); + + WinEnvThreads(const WinEnvThreads&) = delete; + WinEnvThreads& operator=(const WinEnvThreads&) = delete; + + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)); + + int UnSchedule(void* arg, Env::Priority pri); + + void StartThread(void (*function)(void* arg), void* arg); + + void WaitForJoin(); + + unsigned int GetThreadPoolQueueLen(Env::Priority pri) const; + + int ReserveThreads(int threads_to_be_reserved, Env::Priority pri); + + int ReleaseThreads(int threads_to_be_released, Env::Priority pri); + + static uint64_t gettid(); + + uint64_t GetThreadID() const; + + // Allow increasing the number of worker threads. + void SetBackgroundThreads(int num, Env::Priority pri); + int GetBackgroundThreads(Env::Priority pri); + + void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri); + + private: + Env* hosted_env_; + mutable std::mutex mu_; + std::vector<ThreadPoolImpl> thread_pools_; + std::vector<Thread> threads_to_join_; +}; + +class WinClock : public SystemClock { + public: + WinClock(); + virtual ~WinClock() {} + + static const char* kClassName() { return "WindowsClock"; } + const char* Name() const override { return kDefaultName(); } + const char* NickName() const override { return kClassName(); } + + uint64_t NowMicros() override; + + uint64_t NowNanos() override; + + // 0 indicates not supported + uint64_t CPUMicros() override { return 0; } + void SleepForMicroseconds(int micros) override; + + Status GetCurrentTime(int64_t* unix_time) override; + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time); + + uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } + + private: + using FnGetSystemTimePreciseAsFileTime = VOID(WINAPI*)(LPFILETIME); + + uint64_t perf_counter_frequency_; + uint64_t nano_seconds_per_period_; + FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; +}; + +class WinFileSystem : public FileSystem { + public: + static const std::shared_ptr<WinFileSystem>& Default(); + WinFileSystem(const std::shared_ptr<SystemClock>& clock); + ~WinFileSystem() {} + static const char* kClassName() { return "WinFS"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const { return kDefaultName(); } + + static size_t GetSectorSize(const std::string& fname); + size_t GetPageSize() const { return page_size_; } + size_t GetAllocationGranularity() const { return allocation_granularity_; } + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + // Truncate the named file to the specified size. + IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSSequentialFile>* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSRandomAccessFile>* result, + IODebugContext* /*dbg*/) override; + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr<FSWritableFile>* r, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr<FSRandomRWFile>* result, + IODebugContext* dbg) override; + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr<MemoryMappedFileBuffer>* result) override; + + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr<FSDirectory>* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& f, const IOOptions& io_opts, + IODebugContext* dbg) override; + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector<std::string>* r, + IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + // Delete the specified directory. + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + // Store the size of fname in *file_size. + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + // Store the last modification time of fname in *file_mtime. + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + // Rename file src to target. + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + + // Hard Link file src to target. + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NumFileLinks(const std::string& /*fname*/, + const IOOptions& /*options*/, uint64_t* /*count*/, + IODebugContext* /*dbg*/) override; + IOStatus AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, + const IOOptions& /*options*/, bool* /*res*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can override to provide custom + // logger. + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr<Logger>* result, + IODebugContext* dbg) override; + // Get full directory name for this db. + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; + IOStatus IsDirectory(const std::string& /*path*/, const IOOptions& options, + bool* is_dir, IODebugContext* /*dgb*/) override; + // This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + IOStatus GetFreeSpace(const std::string& /*path*/, + const IOOptions& /*options*/, uint64_t* /*diskfree*/, + IODebugContext* /*dbg*/) override; + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override; + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override; + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override; + + protected: + static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); + // Returns true iff the named directory exists and is a directory. + + virtual bool DirExists(const std::string& dname); + // Helper for NewWritable and ReopenWritableFile + virtual IOStatus OpenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr<FSWritableFile>* result, + bool reopen); + + private: + std::shared_ptr<SystemClock> clock_; + size_t page_size_; + size_t allocation_granularity_; +}; + +// Designed for inheritance so can be re-used +// but certain parts replaced +class WinEnvIO { + public: + explicit WinEnvIO(Env* hosted_env); + + virtual ~WinEnvIO(); + + virtual Status GetHostName(char* name, uint64_t len); + + private: + Env* hosted_env_; +}; + +class WinEnv : public CompositeEnv { + public: + WinEnv(); + + ~WinEnv(); + static const char* kClassName() { return "WinEnv"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } + + Status GetHostName(char* name, uint64_t len) override; + + Status GetThreadList(std::vector<ThreadStatus>* thread_list) override; + + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) override; + + int UnSchedule(void* arg, Env::Priority pri) override; + + void StartThread(void (*function)(void* arg), void* arg) override; + + void WaitForJoin() override; + + unsigned int GetThreadPoolQueueLen(Env::Priority pri) const override; + + int ReserveThreads(int threads_to_be_reserved, Env::Priority pri) override; + + int ReleaseThreads(int threads_to_be_released, Env::Priority pri) override; + + uint64_t GetThreadID() const override; + + // Allow increasing the number of worker threads. + void SetBackgroundThreads(int num, Env::Priority pri) override; + int GetBackgroundThreads(Env::Priority pri) override; + + void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override; + + private: + WinEnvIO winenv_io_; + WinEnvThreads winenv_threads_; +}; + +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/win/io_win.cc b/src/rocksdb/port/win/io_win.cc new file mode 100644 index 000000000..4fa735518 --- /dev/null +++ b/src/rocksdb/port/win/io_win.cc @@ -0,0 +1,1101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(OS_WIN) + +#include "port/win/io_win.h" + +#include "env_win.h" +#include "monitoring/iostats_context_imp.h" +#include "test_util/sync_point.h" +#include "util/aligned_buffer.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { + +/* + * DirectIOHelper + */ +namespace { + +const size_t kSectorSize = 512; + +inline bool IsPowerOfTwo(const size_t alignment) { + return ((alignment) & (alignment - 1)) == 0; +} + +inline bool IsAligned(size_t alignment, const void* ptr) { + return ((uintptr_t(ptr)) & (alignment - 1)) == 0; +} +} // namespace + +std::string GetWindowsErrSz(DWORD err) { + std::string Err; + LPSTR lpMsgBuf = nullptr; + FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, + 0, // Default language + reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL); + + if (lpMsgBuf) { + Err = lpMsgBuf; + LocalFree(lpMsgBuf); + } + return Err; +} + +// We preserve the original name of this interface to denote the original idea +// behind it. +// All reads happen by a specified offset and pwrite interface does not change +// the position of the file pointer. Judging from the man page and errno it does +// execute +// lseek atomically to return the position of the file back where it was. +// WriteFile() does not +// have this capability. Therefore, for both pread and pwrite the pointer is +// advanced to the next position +// which is fine for writes because they are (should be) sequential. +// Because all the reads/writes happen by the specified offset, the caller in +// theory should not +// rely on the current file offset. +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written) { + IOStatus s; + bytes_written = 0; + + size_t num_bytes = data.size(); + if (num_bytes > std::numeric_limits<DWORD>::max()) { + // May happen in 64-bit builds where size_t is 64-bits but + // long is still 32-bit, but that's the API here at the moment + return IOStatus::InvalidArgument( + "num_bytes is too large for a single write: " + file_data->GetName()); + } + + OVERLAPPED overlapped = {0}; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + DWORD bytesWritten = 0; + + if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), + static_cast<DWORD>(num_bytes), &bytesWritten, + &overlapped)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(), + lastError); + } else { + bytes_written = bytesWritten; + } + + return s; +} + +// See comments for pwrite above +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read) { + IOStatus s; + bytes_read = 0; + + if (num_bytes > std::numeric_limits<DWORD>::max()) { + return IOStatus::InvalidArgument( + "num_bytes is too large for a single read: " + file_data->GetName()); + } + + OVERLAPPED overlapped = {0}; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + DWORD bytesRead = 0; + + if (FALSE == ReadFile(file_data->GetFileHandle(), src, + static_cast<DWORD>(num_bytes), &bytesRead, + &overlapped)) { + auto lastError = GetLastError(); + // EOF is OK with zero bytes read + if (lastError != ERROR_HANDLE_EOF) { + s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(), + lastError); + } + } else { + bytes_read = bytesRead; + } + + return s; +} + +// SetFileInformationByHandle() is capable of fast pre-allocates. +// However, this does not change the file end position unless the file is +// truncated and the pre-allocated space is not considered filled with zeros. +IOStatus fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size) { + IOStatus status; + + FILE_ALLOCATION_INFO alloc_info; + alloc_info.AllocationSize.QuadPart = to_size; + + if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, + sizeof(FILE_ALLOCATION_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError( + "Failed to pre-allocate space: " + filename, lastError); + } + + return status; +} + +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) { + IOStatus status; + + FILE_END_OF_FILE_INFO end_of_file; + end_of_file.EndOfFile.QuadPart = toSize; + + if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, + sizeof(FILE_END_OF_FILE_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, + lastError); + } + + return status; +} + +size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/, + size_t /*max_size*/) { + // Returning 0 is safe as it causes the table reader to generate a unique ID. + // This is suboptimal for performance as it prevents multiple table readers + // for the same file from sharing cached blocks. For example, if users have + // a low value for `max_open_files`, there can be many table readers opened + // for the same file. + // + // TODO: this is a temporarily solution as it is safe but not optimal for + // performance. For more details see discussion in + // https://github.com/facebook/rocksdb/pull/5844. + return 0; +} + +WinFileData::WinFileData(const std::string& filename, HANDLE hFile, + bool direct_io) + : filename_(filename), + hFile_(hFile), + use_direct_io_(direct_io), + sector_size_(WinFileSystem::GetSectorSize(filename)) {} + +bool WinFileData::IsSectorAligned(const size_t off) const { + return (off & (sector_size_ - 1)) == 0; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// WinMmapReadableFile + +WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName, + HANDLE hFile, HANDLE hMap, + const void* mapped_region, + size_t length) + : WinFileData(fileName, hFile, false /* use_direct_io */), + hMap_(hMap), + mapped_region_(mapped_region), + length_(length) {} + +WinMmapReadableFile::~WinMmapReadableFile() { + BOOL ret __attribute__((__unused__)); + ret = ::UnmapViewOfFile(mapped_region_); + assert(ret); + + ret = ::CloseHandle(hMap_); + assert(ret); +} + +IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { + IOStatus s; + + if (offset > length_) { + *result = Slice(); + return IOError(filename_, EINVAL); + } else if (offset + n > length_) { + n = length_ - static_cast<size_t>(offset); + } + *result = Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n); + return s; +} + +IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); +} + +size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +/////////////////////////////////////////////////////////////////////////////// +/// WinMmapFile + +// Can only truncate or reserve to a sector size aligned if +// used on files that are opened with Unbuffered I/O +IOStatus WinMmapFile::TruncateFile(uint64_t toSize) { + return ftruncate(filename_, hFile_, toSize); +} + +IOStatus WinMmapFile::UnmapCurrentRegion() { + IOStatus status; + + if (mapped_begin_ != nullptr) { + if (!::UnmapViewOfFile(mapped_begin_)) { + status = IOErrorFromWindowsError( + "Failed to unmap file view: " + filename_, GetLastError()); + } + + // Move on to the next portion of the file + file_offset_ += view_size_; + + // UnmapView automatically sends data to disk but not the metadata + // which is good and provides some equivalent of fdatasync() on Linux + // therefore, we donot need separate flag for metadata + mapped_begin_ = nullptr; + mapped_end_ = nullptr; + dst_ = nullptr; + + last_sync_ = nullptr; + pending_sync_ = false; + } + + return status; +} + +IOStatus WinMmapFile::MapNewRegion(const IOOptions& options, + IODebugContext* dbg) { + IOStatus status; + + assert(mapped_begin_ == nullptr); + + size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_; + + if (minDiskSize > reserved_size_) { + status = Allocate(file_offset_, view_size_, options, dbg); + if (!status.ok()) { + return status; + } + } + + // Need to remap + if (hMap_ == NULL || reserved_size_ > mapping_size_) { + if (hMap_ != NULL) { + // Unmap the previous one + BOOL ret __attribute__((__unused__)); + ret = ::CloseHandle(hMap_); + assert(ret); + hMap_ = NULL; + } + + ULARGE_INTEGER mappingSize; + mappingSize.QuadPart = reserved_size_; + + hMap_ = CreateFileMappingA( + hFile_, + NULL, // Security attributes + PAGE_READWRITE, // There is not a write only mode for mapping + mappingSize.HighPart, // Enable mapping the whole file but the actual + // amount mapped is determined by MapViewOfFile + mappingSize.LowPart, + NULL); // Mapping name + + if (NULL == hMap_) { + return IOErrorFromWindowsError( + "WindowsMmapFile failed to create file mapping for: " + filename_, + GetLastError()); + } + + mapping_size_ = reserved_size_; + } + + ULARGE_INTEGER offset; + offset.QuadPart = file_offset_; + + // View must begin at the granularity aligned offset + mapped_begin_ = reinterpret_cast<char*>( + MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, + view_size_, NULL)); + + if (!mapped_begin_) { + status = IOErrorFromWindowsError( + "WindowsMmapFile failed to map file view: " + filename_, + GetLastError()); + } else { + mapped_end_ = mapped_begin_ + view_size_; + dst_ = mapped_begin_; + last_sync_ = mapped_begin_; + pending_sync_ = false; + } + return status; +} + +IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(filename_, hFile_, spaceToReserve); +} + +WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, + size_t page_size, size_t allocation_granularity, + const FileOptions& options) + : WinFileData(fname, hFile, false), + FSWritableFile(options), + hMap_(NULL), + page_size_(page_size), + allocation_granularity_(allocation_granularity), + reserved_size_(0), + mapping_size_(0), + view_size_(0), + mapped_begin_(nullptr), + mapped_end_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0), + pending_sync_(false) { + // Allocation granularity must be obtained from GetSystemInfo() and must be + // a power of two. + assert(allocation_granularity > 0); + assert((allocation_granularity & (allocation_granularity - 1)) == 0); + + assert(page_size > 0); + assert((page_size & (page_size - 1)) == 0); + + // Only for memory mapped writes + assert(options.use_mmap_writes); + + // View size must be both the multiple of allocation_granularity AND the + // page size and the granularity is usually a multiple of a page size. + const size_t viewSize = + 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode + view_size_ = Roundup(viewSize, allocation_granularity_); +} + +WinMmapFile::~WinMmapFile() { + if (hFile_) { + this->Close(IOOptions(), nullptr); + } +} + +IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) { + const char* src = data.data(); + size_t left = data.size(); + + while (left > 0) { + assert(mapped_begin_ <= dst_); + size_t avail = mapped_end_ - dst_; + + if (avail == 0) { + IOStatus s = UnmapCurrentRegion(); + if (s.ok()) { + s = MapNewRegion(options, dbg); + } + + if (!s.ok()) { + return s; + } + } else { + size_t n = std::min(left, avail); + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + pending_sync_ = true; + } + } + + // Now make sure that the last partial page is padded with zeros if needed + size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_); + if (bytesToPad > 0) { + memset(dst_, 0, bytesToPad); + } + + return IOStatus::OK(); +} + +// Means Close() will properly take care of truncate +// and it does not need any additional information +IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) { + IOStatus s; + + assert(NULL != hFile_); + + // We truncate to the precise size so no + // uninitialized data at the end. SetEndOfFile + // which we use does not write zeros and it is good. + uint64_t targetSize = GetFileSize(options, dbg); + + if (mapped_begin_ != nullptr) { + // Sync before unmapping to make sure everything + // is on disk and there is not a lazy writing + // so we are deterministic with the tests + Sync(options, dbg); + s = UnmapCurrentRegion(); + } + + if (NULL != hMap_) { + BOOL ret = ::CloseHandle(hMap_); + if (!ret && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to Close mapping for file: " + filename_, lastError); + } + + hMap_ = NULL; + } + + if (hFile_ != NULL) { + TruncateFile(targetSize); + + BOOL ret = ::CloseHandle(hFile_); + hFile_ = NULL; + + if (!ret && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to close file map handle: " + filename_, lastError); + } + } + + return s; +} + +IOStatus WinMmapFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +// Flush only data +IOStatus WinMmapFile::Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + + // Some writes occurred since last sync + if (dst_ > last_sync_) { + assert(mapped_begin_); + assert(dst_); + assert(dst_ > mapped_begin_); + assert(dst_ < mapped_end_); + + size_t page_begin = + TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); + size_t page_end = + TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); + + // Flush only the amount of that is a multiple of pages + if (!::FlushViewOfFile(mapped_begin_ + page_begin, + (page_end - page_begin) + page_size_)) { + s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, + GetLastError()); + } else { + last_sync_ = dst_; + } + } + + return s; +} + +/** + * Flush data as well as metadata to stable storage. + */ +IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + IOStatus s = Sync(options, dbg); + + // Flush metadata + if (s.ok() && pending_sync_) { + if (!::FlushFileBuffers(hFile_)) { + s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, + GetLastError()); + } + pending_sync_ = false; + } + + return s; +} + +/** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ +uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + size_t used = dst_ - mapped_begin_; + return file_offset_ + used; +} + +IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); +} + +IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus status; + TEST_KILL_RANDOM("WinMmapFile::Allocate"); + + // Make sure that we reserve an aligned amount of space + // since the reservation block size is driven outside so we want + // to check if we are ok with reservation here + size_t spaceToReserve = + Roundup(static_cast<size_t>(offset + len), view_size_); + // Nothing to do + if (spaceToReserve <= reserved_size_) { + return status; + } + + IOSTATS_TIMER_GUARD(allocate_nanos); + status = PreallocateInternal(spaceToReserve); + if (status.ok()) { + reserved_size_ = spaceToReserve; + } + return status; +} + +size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +////////////////////////////////////////////////////////////////////////////////// +// WinSequentialFile + +WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f, + const FileOptions& options) + : WinFileData(fname, f, options.use_direct_reads) {} + +WinSequentialFile::~WinSequentialFile() { + assert(hFile_ != INVALID_HANDLE_VALUE); +} + +IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + IOStatus s; + size_t r = 0; + + assert(result != nullptr); + if (WinFileData::use_direct_io()) { + return IOStatus::NotSupported("Read() does not support direct_io"); + } + + // Windows ReadFile API accepts a DWORD. + // While it is possible to read in a loop if n is too big + // it is an unlikely case. + if (n > std::numeric_limits<DWORD>::max()) { + return IOStatus::InvalidArgument("n is too big for a single ReadFile: " + + filename_); + } + + DWORD bytesToRead = + static_cast<DWORD>(n); // cast is safe due to the check above + DWORD bytesRead = 0; + BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL); + if (ret != FALSE) { + r = bytesRead; + } else { + auto lastError = GetLastError(); + if (lastError != ERROR_HANDLE_EOF) { + s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError); + } + } + + *result = Slice(scratch, r); + return s; +} + +IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const { + return pread(this, src, numBytes, offset, bytes_read); +} + +IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + if (!WinFileData::use_direct_io()) { + return IOStatus::NotSupported("This function is only used for direct_io"); + } + + assert(IsSectorAligned(static_cast<size_t>(offset))); + assert(IsSectorAligned(static_cast<size_t>(n))); + + size_t bytes_read = 0; // out param + IOStatus s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset, + bytes_read); + *result = Slice(scratch, bytes_read); + return s; +} + +IOStatus WinSequentialFile::Skip(uint64_t n) { + // Can't handle more than signed max as SetFilePointerEx accepts a signed + // 64-bit integer. As such it is a highly unlikley case to have n so large. + if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) { + return IOStatus::InvalidArgument( + "n is too large for a single SetFilePointerEx() call" + filename_); + } + + LARGE_INTEGER li; + li.QuadPart = static_cast<LONGLONG>(n); // cast is safe due to the check + // above + BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT); + if (ret == FALSE) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, + lastError); + } + return IOStatus::OK(); +} + +IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); +} + +////////////////////////////////////////////////////////////////////////////////////////////////// +/// WinRandomAccessBase + +inline IOStatus WinRandomAccessImpl::PositionedReadInternal( + char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const { + return pread(file_base_, src, numBytes, offset, bytes_read); +} + +inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, + size_t alignment, + const FileOptions& options) + : file_base_(file_base), + alignment_(std::max(alignment, file_base->GetSectorSize())) { + assert(!options.use_mmap_reads); +} + +inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, + Slice* result, + char* scratch) const { + // Check buffer alignment + if (file_base_->use_direct_io()) { + assert(file_base_->IsSectorAligned(static_cast<size_t>(offset))); + assert(IsAligned(alignment_, scratch)); + } + + if (n == 0) { + *result = Slice(scratch, 0); + return IOStatus::OK(); + } + + size_t bytes_read = 0; + IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read); + *result = Slice(scratch, bytes_read); + return s; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/// WinRandomAccessFile + +WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, + size_t alignment, + const FileOptions& options) + : WinFileData(fname, hFile, options.use_direct_reads), + WinRandomAccessImpl(this, alignment, options) {} + +WinRandomAccessFile::~WinRandomAccessFile() {} + +IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { + return ReadImpl(offset, n, result, scratch); +} + +IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); +} + +size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(GetFileHandle(), id, max_size); +} + +size_t WinRandomAccessFile::GetRequiredBufferAlignment() const { + return GetAlignment(); +} + +///////////////////////////////////////////////////////////////////////////// +// WinWritableImpl +// + +inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), + spaceToReserve); +} + +inline WinWritableImpl::WinWritableImpl(WinFileData* file_data, + size_t alignment) + : file_data_(file_data), + alignment_(std::max(alignment, file_data->GetSectorSize())), + next_write_offset_(0), + reservedsize_(0) { + // Query current position in case ReopenWritableFile is called + // This position is only important for buffered writes + // for unbuffered writes we explicitely specify the position. + LARGE_INTEGER zero_move; + zero_move.QuadPart = 0; // Do not move + LARGE_INTEGER pos; + pos.QuadPart = 0; + BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos, + FILE_CURRENT); + // Querying no supped to fail + if (ret != 0) { + next_write_offset_ = pos.QuadPart; + } else { + assert(false); + } +} + +inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) { + IOStatus s; + + if (data.size() > std::numeric_limits<DWORD>::max()) { + return IOStatus::InvalidArgument("data is too long for a single write" + + file_data_->GetName()); + } + + size_t bytes_written = 0; // out param + + if (file_data_->use_direct_io()) { + // With no offset specified we are appending + // to the end of the file + assert(file_data_->IsSectorAligned(next_write_offset_)); + assert(file_data_->IsSectorAligned(data.size())); + assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data())); + s = pwrite(file_data_, data, next_write_offset_, bytes_written); + } else { + DWORD bytesWritten = 0; + if (!WriteFile(file_data_->GetFileHandle(), data.data(), + static_cast<DWORD>(data.size()), &bytesWritten, NULL)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to WriteFile: " + file_data_->GetName(), lastError); + } else { + bytes_written = bytesWritten; + } + } + + if (s.ok()) { + if (bytes_written == data.size()) { + // This matters for direct_io cases where + // we rely on the fact that next_write_offset_ + // is sector aligned + next_write_offset_ += bytes_written; + } else { + s = IOStatus::IOError("Failed to write all bytes: " + + file_data_->GetName()); + } + } + + return s; +} + +inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data, + uint64_t offset) { + if (file_data_->use_direct_io()) { + assert(file_data_->IsSectorAligned(static_cast<size_t>(offset))); + assert(file_data_->IsSectorAligned(data.size())); + assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data())); + } + + size_t bytes_written = 0; + IOStatus s = pwrite(file_data_, data, offset, bytes_written); + + if (s.ok()) { + if (bytes_written == data.size()) { + // For sequential write this would be simple + // size extension by data.size() + uint64_t write_end = offset + bytes_written; + if (write_end >= next_write_offset_) { + next_write_offset_ = write_end; + } + } else { + s = IOStatus::IOError("Failed to write all of the requested data: " + + file_data_->GetName()); + } + } + return s; +} + +inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) { + // It is tempting to check for the size for sector alignment + // but truncation may come at the end and there is not a requirement + // for this to be sector aligned so long as we do not attempt to write + // after that. The interface docs state that the behavior is undefined + // in that case. + IOStatus s = + ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size); + + if (s.ok()) { + next_write_offset_ = size; + } + return s; +} + +inline IOStatus WinWritableImpl::CloseImpl() { + IOStatus s; + + auto hFile = file_data_->GetFileHandle(); + assert(INVALID_HANDLE_VALUE != hFile); + + if (!::FlushFileBuffers(hFile)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Close() for: " + file_data_->GetName(), + lastError); + } + + if (!file_data_->CloseFile() && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "CloseHandle failed for: " + file_data_->GetName(), lastError); + } + return s; +} + +inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + if (!::FlushFileBuffers(file_data_->GetFileHandle())) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), + lastError); + } + return s; +} + +inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { + IOStatus status; + TEST_KILL_RANDOM("WinWritableFile::Allocate"); + + // Make sure that we reserve an aligned amount of space + // since the reservation block size is driven outside so we want + // to check if we are ok with reservation here + size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len), + static_cast<size_t>(alignment_)); + // Nothing to do + if (spaceToReserve <= reservedsize_) { + return status; + } + + IOSTATS_TIMER_GUARD(allocate_nanos); + status = PreallocateInternal(spaceToReserve); + if (status.ok()) { + reservedsize_ = spaceToReserve; + } + return status; +} + +//////////////////////////////////////////////////////////////////////////////// +/// WinWritableFile + +WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, + size_t alignment, size_t /* capacity */, + const FileOptions& options) + : WinFileData(fname, hFile, options.use_direct_writes), + WinWritableImpl(this, alignment), + FSWritableFile(options) { + assert(!options.use_mmap_writes); +} + +WinWritableFile::~WinWritableFile() {} + +// Indicates if the class makes use of direct I/O +bool WinWritableFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} + +size_t WinWritableFile::GetRequiredBufferAlignment() const { + return static_cast<size_t>(GetAlignment()); +} + +IOStatus WinWritableFile::Append(const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return AppendImpl(data); +} + +IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return PositionedAppendImpl(data, offset); +} + +// Need to implement this so the file is truncated correctly +// when buffered and unbuffered mode +IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return TruncateImpl(size); +} + +IOStatus WinWritableFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return CloseImpl(); +} + +// write out the cached data to the OS cache +// This is now taken care of the WritableFileWriter +IOStatus WinWritableFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} + +IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} + +bool WinWritableFile::IsSyncThreadSafe() const { return true; } + +uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return GetFileNextWriteOffset(); +} + +IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return AllocateImpl(offset, len); +} + +size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(GetFileHandle(), id, max_size); +} + +///////////////////////////////////////////////////////////////////////// +/// WinRandomRWFile + +WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, + size_t alignment, const FileOptions& options) + : WinFileData(fname, hFile, + options.use_direct_reads && options.use_direct_writes), + WinRandomAccessImpl(this, alignment, options), + WinWritableImpl(this, alignment) {} + +bool WinRandomRWFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} + +size_t WinRandomRWFile::GetRequiredBufferAlignment() const { + assert(WinRandomAccessImpl::GetAlignment() == + WinWritableImpl::GetAlignment()); + return static_cast<size_t>(WinRandomAccessImpl::GetAlignment()); +} + +IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return PositionedAppendImpl(data, offset); +} + +IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) const { + return ReadImpl(offset, n, result, scratch); +} + +IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} + +IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return CloseImpl(); +} + +////////////////////////////////////////////////////////////////////////// +/// WinMemoryMappedBufer +WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { + BOOL ret +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif + if (base_ != nullptr) { + ret = ::UnmapViewOfFile(base_); + assert(ret); + base_ = nullptr; + } + if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(map_handle_); + assert(ret); + map_handle_ = NULL; + } + if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(file_handle_); + assert(ret); + file_handle_ = NULL; + } +} + +////////////////////////////////////////////////////////////////////////// +/// WinDirectory + +IOStatus WinDirectory::Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} + +IOStatus WinDirectory::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s = IOStatus::OK(); + BOOL ret __attribute__((__unused__)); + if (handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(handle_); + if (!ret) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Directory closes failed for : " + GetName(), + lastError); + } + handle_ = NULL; + } + return s; +} + +size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(handle_, id, max_size); +} +////////////////////////////////////////////////////////////////////////// +/// WinFileLock + +WinFileLock::~WinFileLock() { + BOOL ret __attribute__((__unused__)); + ret = ::CloseHandle(hFile_); + assert(ret); +} + +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/port/win/io_win.h b/src/rocksdb/port/win/io_win.h new file mode 100644 index 000000000..a4fee8346 --- /dev/null +++ b/src/rocksdb/port/win/io_win.h @@ -0,0 +1,508 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include <stdint.h> +#include <windows.h> + +#include <mutex> +#include <string> + +#include "rocksdb/file_system.h" +#include "rocksdb/status.h" +#include "util/aligned_buffer.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { + +std::string GetWindowsErrSz(DWORD err); + +inline IOStatus IOErrorFromWindowsError(const std::string& context, DWORD err) { + return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) + ? IOStatus::NoSpace(context, GetWindowsErrSz(err)) + : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) + ? IOStatus::PathNotFound(context, GetWindowsErrSz(err)) + : IOStatus::IOError(context, GetWindowsErrSz(err)); +} + +inline IOStatus IOErrorFromLastWindowsError(const std::string& context) { + return IOErrorFromWindowsError(context, GetLastError()); +} + +inline IOStatus IOError(const std::string& context, int err_number) { + return (err_number == ENOSPC) + ? IOStatus::NoSpace(context, errnoStr(err_number).c_str()) + : (err_number == ENOENT) + ? IOStatus::PathNotFound(context, errnoStr(err_number).c_str()) + : IOStatus::IOError(context, errnoStr(err_number).c_str()); +} + +class WinFileData; + +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written); + +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read); + +IOStatus fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); + +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); + +size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size); + +class WinFileData { + protected: + const std::string filename_; + HANDLE hFile_; + // If true, the I/O issued would be direct I/O which the buffer + // will need to be aligned (not sure there is a guarantee that the buffer + // passed in is aligned). + const bool use_direct_io_; + const size_t sector_size_; + + public: + // We want this class be usable both for inheritance (prive + // or protected) and for containment so __ctor and __dtor public + WinFileData(const std::string& filename, HANDLE hFile, bool direct_io); + + virtual ~WinFileData() { this->CloseFile(); } + + bool CloseFile() { + bool result = true; + + if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) { + result = ::CloseHandle(hFile_); + assert(result); + hFile_ = NULL; + } + return result; + } + + const std::string& GetName() const { return filename_; } + + HANDLE GetFileHandle() const { return hFile_; } + + bool use_direct_io() const { return use_direct_io_; } + + size_t GetSectorSize() const { return sector_size_; } + + bool IsSectorAligned(const size_t off) const; + + WinFileData(const WinFileData&) = delete; + WinFileData& operator=(const WinFileData&) = delete; +}; + +class WinSequentialFile : protected WinFileData, public FSSequentialFile { + // Override for behavior change when creating a custom env + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; + + public: + WinSequentialFile(const std::string& fname, HANDLE f, + const FileOptions& options); + + ~WinSequentialFile(); + + WinSequentialFile(const WinSequentialFile&) = delete; + WinSequentialFile& operator=(const WinSequentialFile&) = delete; + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; + + IOStatus Skip(uint64_t n) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } +}; + +// mmap() based random-access +class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile { + HANDLE hMap_; + + const void* mapped_region_; + const size_t length_; + + public: + // mapped_region_[0,length-1] contains the mmapped contents of the file. + WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap, + const void* mapped_region, size_t length); + + ~WinMmapReadableFile(); + + WinMmapReadableFile(const WinMmapReadableFile&) = delete; + WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete; + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +// We preallocate and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class WinMmapFile : private WinFileData, public FSWritableFile { + private: + HANDLE hMap_; + + const size_t page_size_; // We flush the mapping view in page_size + // increments. We may decide if this is a memory + // page size or SSD page size + const size_t + allocation_granularity_; // View must start at such a granularity + + size_t reserved_size_; // Preallocated size + + size_t mapping_size_; // The max size of the mapping object + // we want to guess the final file size to minimize the remapping + size_t view_size_; // How much memory to map into a view at a time + + char* mapped_begin_; // Must begin at the file offset that is aligned with + // allocation_granularity_ + char* mapped_end_; + char* dst_; // Where to write next (in range [mapped_begin_,mapped_end_]) + char* last_sync_; // Where have we synced up to + + uint64_t file_offset_; // Offset of mapped_begin_ in file + + // Do we have unsynced writes? + bool pending_sync_; + + // Can only truncate or reserve to a sector size aligned if + // used on files that are opened with Unbuffered I/O + IOStatus TruncateFile(uint64_t toSize); + + IOStatus UnmapCurrentRegion(); + + IOStatus MapNewRegion(const IOOptions& options, IODebugContext* dbg); + + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); + + public: + WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, + size_t allocation_granularity, const FileOptions& options); + + ~WinMmapFile(); + + WinMmapFile(const WinMmapFile&) = delete; + WinMmapFile& operator=(const WinMmapFile&) = delete; + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } + + // Means Close() will properly take care of truncate + // and it does not need any additional information + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + // Flush only data + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + /** + * Flush data as well as metadata to stable storage. + */ + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + /** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinRandomAccessImpl { + protected: + WinFileData* file_base_; + size_t alignment_; + + // Override for behavior change when creating a custom env + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; + + WinRandomAccessImpl(WinFileData* file_base, size_t alignment, + const FileOptions& options); + + virtual ~WinRandomAccessImpl() {} + + IOStatus ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const; + + size_t GetAlignment() const { return alignment_; } + + public: + WinRandomAccessImpl(const WinRandomAccessImpl&) = delete; + WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete; +}; + +// pread() based random-access +class WinRandomAccessFile + : private WinFileData, + protected WinRandomAccessImpl, // Want to be able to override + // PositionedReadInternal + public FSRandomAccessFile { + public: + WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, + const FileOptions& options); + + ~WinRandomAccessFile(); + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; + + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + virtual size_t GetRequiredBufferAlignment() const override; +}; + +// This is a sequential write class. It has been mimicked (as others) after +// the original Posix class. We add support for unbuffered I/O on windows as +// well +// we utilize the original buffer as an alignment buffer to write directly to +// file with no buffering. +// No buffering requires that the provided buffer is aligned to the physical +// sector size (SSD page size) and +// that all SetFilePointer() operations to occur with such an alignment. +// We thus always write in sector/page size increments to the drive and leave +// the tail for the next write OR for Close() at which point we pad with zeros. +// No padding is required for +// buffered access. +class WinWritableImpl { + protected: + WinFileData* file_data_; + const uint64_t alignment_; + uint64_t + next_write_offset_; // Needed because Windows does not support O_APPEND + uint64_t reservedsize_; // how far we have reserved space + + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); + + WinWritableImpl(WinFileData* file_data, size_t alignment); + + ~WinWritableImpl() {} + + uint64_t GetAlignment() const { return alignment_; } + + IOStatus AppendImpl(const Slice& data); + + // Requires that the data is aligned as specified by + // GetRequiredBufferAlignment() + IOStatus PositionedAppendImpl(const Slice& data, uint64_t offset); + + IOStatus TruncateImpl(uint64_t size); + + IOStatus CloseImpl(); + + IOStatus SyncImpl(const IOOptions& options, IODebugContext* dbg); + + uint64_t GetFileNextWriteOffset() { + // Double accounting now here with WritableFileWriter + // and this size will be wrong when unbuffered access is used + // but tests implement their own writable files and do not use + // WritableFileWrapper + // so we need to squeeze a square peg through + // a round hole here. + return next_write_offset_; + } + + IOStatus AllocateImpl(uint64_t offset, uint64_t len); + + public: + WinWritableImpl(const WinWritableImpl&) = delete; + WinWritableImpl& operator=(const WinWritableImpl&) = delete; +}; + +class WinWritableFile : private WinFileData, + protected WinWritableImpl, + public FSWritableFile { + public: + WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, + size_t capacity, const FileOptions& options); + + ~WinWritableFile(); + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } + + // Requires that the data is aligned as specified by + // GetRequiredBufferAlignment() + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, opts, dbg); + } + + // Need to implement this so the file is truncated correctly + // when buffered and unbuffered mode + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + // write out the cached data to the OS cache + // This is now taken care of the WritableFileWriter + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + virtual bool IsSyncThreadSafe() const override; + + // Indicates if the class makes use of direct I/O + // Use PositionedAppend + virtual bool use_direct_io() const override; + + virtual size_t GetRequiredBufferAlignment() const override; + + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinRandomRWFile : private WinFileData, + protected WinRandomAccessImpl, + protected WinWritableImpl, + public FSRandomRWFile { + public: + WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment, + const FileOptions& options); + + ~WinRandomRWFile() {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + virtual bool use_direct_io() const override; + + // Use the returned alignment value to allocate aligned + // buffer for Write() when use_direct_io() returns true + virtual size_t GetRequiredBufferAlignment() const override; + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return Sync(options, dbg); + } + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; +}; + +class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { + private: + HANDLE file_handle_; + HANDLE map_handle_; + + public: + WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, + size_t size) + : MemoryMappedFileBuffer(base, size), + file_handle_(file_handle), + map_handle_(map_handle) {} + ~WinMemoryMappedBuffer() override; +}; + +class WinDirectory : public FSDirectory { + const std::string filename_; + HANDLE handle_; + + public: + explicit WinDirectory(const std::string& filename, HANDLE h) noexcept + : filename_(filename), handle_(h) { + assert(handle_ != INVALID_HANDLE_VALUE); + } + ~WinDirectory() { + if (handle_ != NULL) { + IOStatus s = WinDirectory::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); + } + } + const std::string& GetName() const { return filename_; } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinFileLock : public FileLock { + public: + explicit WinFileLock(HANDLE hFile) : hFile_(hFile) { + assert(hFile != NULL); + assert(hFile != INVALID_HANDLE_VALUE); + } + + ~WinFileLock(); + + private: + HANDLE hFile_; +}; +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/win/port_win.cc b/src/rocksdb/port/win/port_win.cc new file mode 100644 index 000000000..37e8f655c --- /dev/null +++ b/src/rocksdb/port/win/port_win.cc @@ -0,0 +1,303 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(OS_WIN) + +#include "port/win/port_win.h" + +#include <assert.h> +#include <io.h> +#include <rpc.h> +#include <stdio.h> +#include <string.h> + +#include <chrono> +#include <cstdlib> +#include <exception> +#include <memory> + +#include "port/port_dirent.h" +#include "port/sys_time.h" + +#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES +// utf8 <-> utf16 +#include <codecvt> +#include <locale> +#include <string> +#endif + +#include "logging/logging.h" + +namespace ROCKSDB_NAMESPACE { + +extern const bool kDefaultToAdaptiveMutex = false; + +namespace port { + +#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES +std::string utf16_to_utf8(const std::wstring& utf16) { + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> convert; + return convert.to_bytes(utf16); +} + +std::wstring utf8_to_utf16(const std::string& utf8) { + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; + return converter.from_bytes(utf8); +} +#endif + +void GetTimeOfDay(TimeVal* tv, struct timezone* /* tz */) { + std::chrono::microseconds usNow( + std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::system_clock::now().time_since_epoch())); + + std::chrono::seconds secNow( + std::chrono::duration_cast<std::chrono::seconds>(usNow)); + + tv->tv_sec = static_cast<long>(secNow.count()); + tv->tv_usec = static_cast<long>( + usNow.count() - + std::chrono::duration_cast<std::chrono::microseconds>(secNow).count()); +} + +Mutex::~Mutex() {} + +CondVar::~CondVar() {} + +void CondVar::Wait() { + // Caller must ensure that mutex is held prior to calling this method + std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock); +#ifndef NDEBUG + mu_->locked_ = false; +#endif + cv_.wait(lk); +#ifndef NDEBUG + mu_->locked_ = true; +#endif + // Release ownership of the lock as we don't want it to be unlocked when + // it goes out of scope (as we adopted the lock and didn't lock it ourselves) + lk.release(); +} + +bool CondVar::TimedWait(uint64_t abs_time_us) { + // MSVC++ library implements wait_until in terms of wait_for so + // we need to convert absolute wait into relative wait. + std::chrono::microseconds usAbsTime(abs_time_us); + + std::chrono::microseconds usNow( + std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::system_clock::now().time_since_epoch())); + std::chrono::microseconds relTimeUs = (usAbsTime > usNow) + ? (usAbsTime - usNow) + : std::chrono::microseconds::zero(); + + // Caller must ensure that mutex is held prior to calling this method + std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock); + + // Work around https://github.com/microsoft/STL/issues/369 +#if defined(_MSC_VER) && \ + (!defined(_MSVC_STL_UPDATE) || _MSVC_STL_UPDATE < 202008L) + if (relTimeUs == std::chrono::microseconds::zero()) { + lk.unlock(); + lk.lock(); + } +#endif +#ifndef NDEBUG + mu_->locked_ = false; +#endif + std::cv_status cvStatus = cv_.wait_for(lk, relTimeUs); +#ifndef NDEBUG + mu_->locked_ = true; +#endif + // Release ownership of the lock as we don't want it to be unlocked when + // it goes out of scope (as we adopted the lock and didn't lock it ourselves) + lk.release(); + + if (cvStatus == std::cv_status::timeout) { + return true; + } + + return false; +} + +void CondVar::Signal() { cv_.notify_one(); } + +void CondVar::SignalAll() { cv_.notify_all(); } + +int PhysicalCoreID() { return GetCurrentProcessorNumber(); } + +void InitOnce(OnceType* once, void (*initializer)()) { + std::call_once(once->flag_, initializer); +} + +// Private structure, exposed only by pointer +struct DIR { + HANDLE handle_; + bool firstread_; + RX_WIN32_FIND_DATA data_; + dirent entry_; + + DIR() : handle_(INVALID_HANDLE_VALUE), firstread_(true) {} + + DIR(const DIR&) = delete; + DIR& operator=(const DIR&) = delete; + + ~DIR() { + if (INVALID_HANDLE_VALUE != handle_) { + ::FindClose(handle_); + } + } +}; + +DIR* opendir(const char* name) { + if (!name || *name == 0) { + errno = ENOENT; + return nullptr; + } + + std::string pattern(name); + pattern.append("\\").append("*"); + + std::unique_ptr<DIR> dir(new DIR); + + dir->handle_ = + RX_FindFirstFileEx(RX_FN(pattern).c_str(), + FindExInfoBasic, // Do not want alternative name + &dir->data_, FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); + + if (dir->handle_ == INVALID_HANDLE_VALUE) { + return nullptr; + } + + RX_FILESTRING x(dir->data_.cFileName, RX_FNLEN(dir->data_.cFileName)); + strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), FN_TO_RX(x).c_str()); + + return dir.release(); +} + +struct dirent* readdir(DIR* dirp) { + if (!dirp || dirp->handle_ == INVALID_HANDLE_VALUE) { + errno = EBADF; + return nullptr; + } + + if (dirp->firstread_) { + dirp->firstread_ = false; + return &dirp->entry_; + } + + auto ret = RX_FindNextFile(dirp->handle_, &dirp->data_); + + if (ret == 0) { + return nullptr; + } + + RX_FILESTRING x(dirp->data_.cFileName, RX_FNLEN(dirp->data_.cFileName)); + strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), + FN_TO_RX(x).c_str()); + + return &dirp->entry_; +} + +int closedir(DIR* dirp) { + delete dirp; + return 0; +} + +int truncate(const char* path, int64_t length) { + if (path == nullptr) { + errno = EFAULT; + return -1; + } + return ROCKSDB_NAMESPACE::port::Truncate(path, length); +} + +int Truncate(std::string path, int64_t len) { + if (len < 0) { + errno = EINVAL; + return -1; + } + + HANDLE hFile = + RX_CreateFile(RX_FN(path).c_str(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, // Security attrs + OPEN_EXISTING, // Truncate existing file only + FILE_ATTRIBUTE_NORMAL, NULL); + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + if (lastError == ERROR_FILE_NOT_FOUND) { + errno = ENOENT; + } else if (lastError == ERROR_ACCESS_DENIED) { + errno = EACCES; + } else { + errno = EIO; + } + return -1; + } + + int result = 0; + FILE_END_OF_FILE_INFO end_of_file; + end_of_file.EndOfFile.QuadPart = len; + + if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, + sizeof(FILE_END_OF_FILE_INFO))) { + errno = EIO; + result = -1; + } + + CloseHandle(hFile); + return result; +} + +void Crash(const std::string& srcfile, int srcline) { + fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); + fflush(stdout); + abort(); +} + +int GetMaxOpenFiles() { return -1; } + +// Assume 4KB page size +const size_t kPageSize = 4U * 1024U; + +void SetCpuPriority(ThreadId id, CpuPriority priority) { + // Not supported + (void)id; + (void)priority; +} + +int64_t GetProcessID() { return GetCurrentProcessId(); } + +bool GenerateRfcUuid(std::string* output) { + UUID uuid; + UuidCreateSequential(&uuid); + + RPC_CSTR rpc_str; + auto status = UuidToStringA(&uuid, &rpc_str); + if (status != RPC_S_OK) { + return false; + } + + // rpc_str is nul-terminated + *output = reinterpret_cast<char*>(rpc_str); + + status = RpcStringFreeA(&rpc_str); + assert(status == RPC_S_OK); + + return true; +} + +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/port/win/port_win.h b/src/rocksdb/port/win/port_win.h new file mode 100644 index 000000000..989b5620b --- /dev/null +++ b/src/rocksdb/port/win/port_win.h @@ -0,0 +1,378 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#pragma once + +// Always want minimum headers +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif + +#include <windows.h> +#include <string> +#include <thread> +#include <string.h> +#include <mutex> +#include <limits> +#include <condition_variable> +#include <malloc.h> +#include <intrin.h> +#include <process.h> + +#include <stdint.h> + +#include "port/win/win_thread.h" + +#include "rocksdb/options.h" + +#undef min +#undef max +#undef DeleteFile +#undef GetCurrentTime + +#ifndef strcasecmp +#define strcasecmp _stricmp +#endif + +#undef GetCurrentTime +#undef DeleteFile + +#ifndef _SSIZE_T_DEFINED +using ssize_t = SSIZE_T; +#endif + +// size_t printf formatting named in the manner of C99 standard formatting +// strings such as PRIu64 +// in fact, we could use that one +#ifndef ROCKSDB_PRIszt +#define ROCKSDB_PRIszt "Iu" +#endif + +#ifdef _MSC_VER +#define __attribute__(A) + +#endif + +namespace ROCKSDB_NAMESPACE { + +#define PREFETCH(addr, rw, locality) + +extern const bool kDefaultToAdaptiveMutex; + +namespace port { + +// "Windows is designed to run on little-endian computer architectures." +// https://docs.microsoft.com/en-us/windows/win32/sysinfo/registry-value-types +constexpr bool kLittleEndian = true; +#undef PLATFORM_IS_LITTLE_ENDIAN + +class CondVar; + +class Mutex { + public: + static const char* kName() { return "std::mutex"; } + + explicit Mutex(bool IGNORED_adaptive = kDefaultToAdaptiveMutex) +#ifndef NDEBUG + : locked_(false) +#endif + { + (void)IGNORED_adaptive; + } + + ~Mutex(); + + void Lock() { + mutex_.lock(); +#ifndef NDEBUG + locked_ = true; +#endif + } + + void Unlock() { +#ifndef NDEBUG + locked_ = false; +#endif + mutex_.unlock(); + } + + bool TryLock() { + bool ret = mutex_.try_lock(); +#ifndef NDEBUG + if (ret) { + locked_ = true; + } +#endif + return ret; + } + + // this will assert if the mutex is not locked + // it does NOT verify that mutex is held by a calling thread + void AssertHeld() { +#ifndef NDEBUG + assert(locked_); +#endif + } + + // Also implement std Lockable + inline void lock() { Lock(); } + inline void unlock() { Unlock(); } + inline bool try_lock() { return TryLock(); } + + // Mutex is move only with lock ownership transfer + Mutex(const Mutex&) = delete; + void operator=(const Mutex&) = delete; + + private: + friend class CondVar; + + std::mutex& getLock() { return mutex_; } + + std::mutex mutex_; +#ifndef NDEBUG + bool locked_; +#endif +}; + +class RWMutex { + public: + RWMutex() { InitializeSRWLock(&srwLock_); } + // No copying allowed + RWMutex(const RWMutex&) = delete; + void operator=(const RWMutex&) = delete; + + void ReadLock() { AcquireSRWLockShared(&srwLock_); } + + void WriteLock() { AcquireSRWLockExclusive(&srwLock_); } + + void ReadUnlock() { ReleaseSRWLockShared(&srwLock_); } + + void WriteUnlock() { ReleaseSRWLockExclusive(&srwLock_); } + + // Empty as in POSIX + void AssertHeld() {} + + private: + SRWLOCK srwLock_; +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu) : mu_(mu) {} + + ~CondVar(); + void Wait(); + bool TimedWait(uint64_t expiration_time); + void Signal(); + void SignalAll(); + + // Condition var is not copy/move constructible + CondVar(const CondVar&) = delete; + CondVar& operator=(const CondVar&) = delete; + + CondVar(CondVar&&) = delete; + CondVar& operator=(CondVar&&) = delete; + + private: + std::condition_variable cv_; + Mutex* mu_; +}; + +#ifdef _POSIX_THREADS +using Thread = std::thread; +#else +// Wrapper around the platform efficient +// or otherwise preferrable implementation +using Thread = WindowsThread; +#endif + +// OnceInit type helps emulate +// Posix semantics with initialization +// adopted in the project +struct OnceType { + struct Init {}; + + OnceType() {} + OnceType(const Init&) {} + OnceType(const OnceType&) = delete; + OnceType& operator=(const OnceType&) = delete; + + std::once_flag flag_; +}; + +#define LEVELDB_ONCE_INIT port::OnceType::Init() +extern void InitOnce(OnceType* once, void (*initializer)()); + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 64U +#endif + +#ifdef ROCKSDB_JEMALLOC +// Separate inlines so they can be replaced if needed +void* jemalloc_aligned_alloc(size_t size, size_t alignment) noexcept; +void jemalloc_aligned_free(void* p) noexcept; +#endif + +inline void* cacheline_aligned_alloc(size_t size) { +#ifdef ROCKSDB_JEMALLOC + return jemalloc_aligned_alloc(size, CACHE_LINE_SIZE); +#else + return _aligned_malloc(size, CACHE_LINE_SIZE); +#endif +} + +inline void cacheline_aligned_free(void* memblock) { +#ifdef ROCKSDB_JEMALLOC + jemalloc_aligned_free(memblock); +#else + _aligned_free(memblock); +#endif +} + +extern const size_t kPageSize; + +// Part of C++11 +#define ALIGN_AS(n) alignas(n) + +static inline void AsmVolatilePause() { +#if defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) + YieldProcessor(); +#endif + // it would be nice to get "wfe" on ARM here +} + +extern int PhysicalCoreID(); + +// For Thread Local Storage abstraction +using pthread_key_t = DWORD; + +inline int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) { + // Not used + (void)destructor; + + pthread_key_t k = TlsAlloc(); + if (TLS_OUT_OF_INDEXES == k) { + return ENOMEM; + } + + *key = k; + return 0; +} + +inline int pthread_key_delete(pthread_key_t key) { + if (!TlsFree(key)) { + return EINVAL; + } + return 0; +} + +inline int pthread_setspecific(pthread_key_t key, const void* value) { + if (!TlsSetValue(key, const_cast<void*>(value))) { + return ENOMEM; + } + return 0; +} + +inline void* pthread_getspecific(pthread_key_t key) { + void* result = TlsGetValue(key); + if (!result) { + if (GetLastError() != ERROR_SUCCESS) { + errno = EINVAL; + } else { + errno = NOERROR; + } + } + return result; +} + +// UNIX equiv although errno numbers will be off +// using C-runtime to implement. Note, this does not +// feel space with zeros in case the file is extended. +int truncate(const char* path, int64_t length); +int Truncate(std::string path, int64_t length); +void Crash(const std::string& srcfile, int srcline); +extern int GetMaxOpenFiles(); +std::string utf16_to_utf8(const std::wstring& utf16); +std::wstring utf8_to_utf16(const std::string& utf8); + +using ThreadId = int; + +extern void SetCpuPriority(ThreadId id, CpuPriority priority); + +int64_t GetProcessID(); + +// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns +// true on success or false on failure. +bool GenerateRfcUuid(std::string* output); + +} // namespace port + +#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES + +#define RX_FILESTRING std::wstring +#define RX_FN(a) ROCKSDB_NAMESPACE::port::utf8_to_utf16(a) +#define FN_TO_RX(a) ROCKSDB_NAMESPACE::port::utf16_to_utf8(a) +#define RX_FNCMP(a, b) ::wcscmp(a, RX_FN(b).c_str()) +#define RX_FNLEN(a) ::wcslen(a) + +#define RX_DeleteFile DeleteFileW +#define RX_CreateFile CreateFileW +#define RX_CreateFileMapping CreateFileMappingW +#define RX_GetFileAttributesEx GetFileAttributesExW +#define RX_FindFirstFileEx FindFirstFileExW +#define RX_FindNextFile FindNextFileW +#define RX_WIN32_FIND_DATA WIN32_FIND_DATAW +#define RX_CreateDirectory CreateDirectoryW +#define RX_RemoveDirectory RemoveDirectoryW +#define RX_GetFileAttributesEx GetFileAttributesExW +#define RX_MoveFileEx MoveFileExW +#define RX_CreateHardLink CreateHardLinkW +#define RX_PathIsRelative PathIsRelativeW +#define RX_GetCurrentDirectory GetCurrentDirectoryW +#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExW +#define RX_PathIsDirectory PathIsDirectoryW + +#else + +#define RX_FILESTRING std::string +#define RX_FN(a) a +#define FN_TO_RX(a) a +#define RX_FNCMP(a, b) strcmp(a, b) +#define RX_FNLEN(a) strlen(a) + +#define RX_DeleteFile DeleteFileA +#define RX_CreateFile CreateFileA +#define RX_CreateFileMapping CreateFileMappingA +#define RX_GetFileAttributesEx GetFileAttributesExA +#define RX_FindFirstFileEx FindFirstFileExA +#define RX_CreateDirectory CreateDirectoryA +#define RX_FindNextFile FindNextFileA +#define RX_WIN32_FIND_DATA WIN32_FIND_DATAA +#define RX_CreateDirectory CreateDirectoryA +#define RX_RemoveDirectory RemoveDirectoryA +#define RX_GetFileAttributesEx GetFileAttributesExA +#define RX_MoveFileEx MoveFileExA +#define RX_CreateHardLink CreateHardLinkA +#define RX_PathIsRelative PathIsRelativeA +#define RX_GetCurrentDirectory GetCurrentDirectoryA +#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExA +#define RX_PathIsDirectory PathIsDirectoryA + +#endif + +using port::pthread_getspecific; +using port::pthread_key_create; +using port::pthread_key_delete; +using port::pthread_key_t; +using port::pthread_setspecific; +using port::truncate; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/win/win_jemalloc.cc b/src/rocksdb/port/win/win_jemalloc.cc new file mode 100644 index 000000000..cf38f55b7 --- /dev/null +++ b/src/rocksdb/port/win/win_jemalloc.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(OS_WIN) + +#ifndef ROCKSDB_JEMALLOC +#error This file can only be part of jemalloc aware build +#endif + +#include <stdexcept> + +#include "jemalloc/jemalloc.h" +#include "port/win/port_win.h" + +#if defined(ZSTD) && defined(ZSTD_STATIC_LINKING_ONLY) +#include <zstd.h> +#if (ZSTD_VERSION_NUMBER >= 500) +namespace ROCKSDB_NAMESPACE { +namespace port { +void* JemallocAllocateForZSTD(void* /* opaque */, size_t size) { + return je_malloc(size); +} +void JemallocDeallocateForZSTD(void* /* opaque */, void* address) { + je_free(address); +} +ZSTD_customMem GetJeZstdAllocationOverrides() { + return {JemallocAllocateForZSTD, JemallocDeallocateForZSTD, nullptr}; +} +} // namespace port +} // namespace ROCKSDB_NAMESPACE +#endif // (ZSTD_VERSION_NUMBER >= 500) +#endif // defined(ZSTD) defined(ZSTD_STATIC_LINKING_ONLY) + +// Global operators to be replaced by a linker when this file is +// a part of the build + +namespace ROCKSDB_NAMESPACE { +namespace port { +void* jemalloc_aligned_alloc(size_t size, size_t alignment) noexcept { + return je_aligned_alloc(alignment, size); +} +void jemalloc_aligned_free(void* p) noexcept { je_free(p); } +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +void* operator new(size_t size) { + void* p = je_malloc(size); + if (!p) { + throw std::bad_alloc(); + } + return p; +} + +void* operator new[](size_t size) { + void* p = je_malloc(size); + if (!p) { + throw std::bad_alloc(); + } + return p; +} + +void operator delete(void* p) { + if (p) { + je_free(p); + } +} + +void operator delete[](void* p) { + if (p) { + je_free(p); + } +} + +#endif diff --git a/src/rocksdb/port/win/win_logger.cc b/src/rocksdb/port/win/win_logger.cc new file mode 100644 index 000000000..072ea419a --- /dev/null +++ b/src/rocksdb/port/win/win_logger.cc @@ -0,0 +1,192 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#if defined(OS_WIN) + +#include "port/win/win_logger.h" + +#include <fcntl.h> +#include <stdio.h> +#include <time.h> + +#include <algorithm> +#include <atomic> + +#include "monitoring/iostats_context_imp.h" +#include "port/sys_time.h" +#include "port/win/env_win.h" +#include "port/win/io_win.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { + +namespace port { + +WinLogger::WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, + const InfoLogLevel log_level) + : Logger(log_level), + file_(file), + gettid_(gettid), + log_size_(0), + last_flush_micros_(0), + clock_(clock), + flush_pending_(false) { + assert(file_ != NULL); + assert(file_ != INVALID_HANDLE_VALUE); +} + +void WinLogger::DebugWriter(const char* str, int len) { + assert(file_ != INVALID_HANDLE_VALUE); + DWORD bytesWritten = 0; + BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL); + if (ret == FALSE) { + std::string errSz = GetWindowsErrSz(GetLastError()); + fprintf(stderr, "%s", errSz.c_str()); + } +} + +WinLogger::~WinLogger() { CloseInternal().PermitUncheckedError(); } + +Status WinLogger::CloseImpl() { return CloseInternal(); } + +Status WinLogger::CloseInternal() { + Status s; + if (INVALID_HANDLE_VALUE != file_) { + BOOL ret = FlushFileBuffers(file_); + if (ret == 0) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError); + } + ret = CloseHandle(file_); + // On error the return value is zero + if (ret == 0 && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError); + } + file_ = INVALID_HANDLE_VALUE; + closed_ = true; + } + return s; +} + +void WinLogger::Flush() { + assert(file_ != INVALID_HANDLE_VALUE); + if (flush_pending_) { + flush_pending_ = false; + // With Windows API writes go to OS buffers directly so no fflush needed + // unlike with C runtime API. We don't flush all the way to disk + // for perf reasons. + } + + last_flush_micros_ = clock_->NowMicros(); +} + +void WinLogger::Logv(const char* format, va_list ap) { + IOSTATS_TIMER_GUARD(logger_nanos); + assert(file_ != INVALID_HANDLE_VALUE); + + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + std::unique_ptr<char[]> largeBuffer; + for (int iter = 0; iter < 2; ++iter) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + largeBuffer.reset(new char[bufsize]); + base = largeBuffer.get(); + } + + char* p = base; + char* limit = base + bufsize; + + port::TimeVal now_tv; + port::GetTimeOfDay(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_s(&t, &seconds); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast<int>(now_tv.tv_usec), + static_cast<long long unsigned int>(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + int done = vsnprintf(p, limit - p, format, backup_ap); + if (done > 0) { + p += done; + } else { + continue; + } + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + + DWORD bytesWritten = 0; + BOOL ret = WriteFile(file_, base, static_cast<DWORD>(write_size), + &bytesWritten, NULL); + if (ret == FALSE) { + std::string errSz = GetWindowsErrSz(GetLastError()); + fprintf(stderr, "%s", errSz.c_str()); + } + + flush_pending_ = true; + assert((bytesWritten == write_size) || (ret == FALSE)); + if (bytesWritten > 0) { + log_size_ += write_size; + } + + uint64_t now_micros = + static_cast<uint64_t>(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; + // With Windows API writes go to OS buffers directly so no fflush needed + // unlike with C runtime API. We don't flush all the way to disk + // for perf reasons. + last_flush_micros_ = now_micros; + } + break; + } +} + +size_t WinLogger::GetLogFileSize() const { return log_size_; } + +} // namespace port + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/port/win/win_logger.h b/src/rocksdb/port/win/win_logger.h new file mode 100644 index 000000000..1ca4610e9 --- /dev/null +++ b/src/rocksdb/port/win/win_logger.h @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#pragma once + +#include <stdint.h> +#include <windows.h> + +#include <atomic> +#include <memory> + +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; + +namespace port { +class WinLogger : public ROCKSDB_NAMESPACE::Logger { + public: + WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, + const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL); + + virtual ~WinLogger(); + + WinLogger(const WinLogger&) = delete; + + WinLogger& operator=(const WinLogger&) = delete; + + void Flush() override; + + using ROCKSDB_NAMESPACE::Logger::Logv; + void Logv(const char* format, va_list ap) override; + + size_t GetLogFileSize() const override; + + void DebugWriter(const char* str, int len); + + protected: + Status CloseImpl() override; + + private: + HANDLE file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + std::atomic_size_t log_size_; + std::atomic_uint_fast64_t last_flush_micros_; + SystemClock* clock_; + bool flush_pending_; + + Status CloseInternal(); + + const static uint64_t flush_every_seconds_ = 5; +}; +} // namespace port + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/win/win_thread.cc b/src/rocksdb/port/win/win_thread.cc new file mode 100644 index 000000000..3c82e736e --- /dev/null +++ b/src/rocksdb/port/win/win_thread.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(OS_WIN) +// Most Mingw builds support std::thread only when using posix threads. +// In that case, some of these functions will be unavailable. +// Note that we're using either WindowsThread or std::thread, depending on +// which one is available. +#ifndef _POSIX_THREADS + +#include "port/win/win_thread.h" + +#include <assert.h> +#include <process.h> // __beginthreadex +#include <windows.h> + +#include <stdexcept> +#include <system_error> +#include <thread> + +namespace ROCKSDB_NAMESPACE { +namespace port { + +struct WindowsThread::Data { + std::function<void()> func_; + uintptr_t handle_; + + Data(std::function<void()>&& func) : func_(std::move(func)), handle_(0) {} + + Data(const Data&) = delete; + Data& operator=(const Data&) = delete; + + static unsigned int __stdcall ThreadProc(void* arg); +}; + +void WindowsThread::Init(std::function<void()>&& func) { + data_ = std::make_shared<Data>(std::move(func)); + // We create another instance of std::shared_ptr to get an additional ref + // since we may detach and destroy this instance before the threadproc + // may start to run. We choose to allocate this additional ref on the heap + // so we do not need to synchronize and allow this thread to proceed + std::unique_ptr<std::shared_ptr<Data>> th_data( + new std::shared_ptr<Data>(data_)); + + data_->handle_ = _beginthreadex(NULL, + 0, // stack size + &Data::ThreadProc, th_data.get(), + 0, // init flag + &th_id_); + + if (data_->handle_ == 0) { + throw std::system_error( + std::make_error_code(std::errc::resource_unavailable_try_again), + "Unable to create a thread"); + } + th_data.release(); +} + +WindowsThread::WindowsThread() : data_(nullptr), th_id_(0) {} + +WindowsThread::~WindowsThread() { + // Must be joined or detached + // before destruction. + // This is the same as std::thread + if (data_) { + if (joinable()) { + assert(false); + std::terminate(); + } + data_.reset(); + } +} + +WindowsThread::WindowsThread(WindowsThread&& o) noexcept : WindowsThread() { + *this = std::move(o); +} + +WindowsThread& WindowsThread::operator=(WindowsThread&& o) noexcept { + if (joinable()) { + assert(false); + std::terminate(); + } + + data_ = std::move(o.data_); + + // Per spec both instances will have the same id + th_id_ = o.th_id_; + + return *this; +} + +bool WindowsThread::joinable() const { return (data_ && data_->handle_ != 0); } + +WindowsThread::native_handle_type WindowsThread::native_handle() const { + return reinterpret_cast<native_handle_type>(data_->handle_); +} + +unsigned WindowsThread::hardware_concurrency() { + return std::thread::hardware_concurrency(); +} + +void WindowsThread::join() { + if (!joinable()) { + assert(false); + throw std::system_error(std::make_error_code(std::errc::invalid_argument), + "Thread is no longer joinable"); + } + + if (GetThreadId(GetCurrentThread()) == th_id_) { + assert(false); + throw std::system_error( + std::make_error_code(std::errc::resource_deadlock_would_occur), + "Can not join itself"); + } + + auto ret = + WaitForSingleObject(reinterpret_cast<HANDLE>(data_->handle_), INFINITE); + if (ret != WAIT_OBJECT_0) { + auto lastError = GetLastError(); + assert(false); + throw std::system_error(static_cast<int>(lastError), std::system_category(), + "WaitForSingleObjectFailed: thread join"); + } + + BOOL rc +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif + rc = CloseHandle(reinterpret_cast<HANDLE>(data_->handle_)); + assert(rc != 0); + data_->handle_ = 0; +} + +bool WindowsThread::detach() { + if (!joinable()) { + assert(false); + throw std::system_error(std::make_error_code(std::errc::invalid_argument), + "Thread is no longer available"); + } + + BOOL ret = CloseHandle(reinterpret_cast<HANDLE>(data_->handle_)); + data_->handle_ = 0; + + return (ret != 0); +} + +void WindowsThread::swap(WindowsThread& o) { + data_.swap(o.data_); + std::swap(th_id_, o.th_id_); +} + +unsigned int __stdcall WindowsThread::Data::ThreadProc(void* arg) { + auto ptr = reinterpret_cast<std::shared_ptr<Data>*>(arg); + std::unique_ptr<std::shared_ptr<Data>> data(ptr); + (*data)->func_(); + return 0; +} +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#endif // !_POSIX_THREADS +#endif // OS_WIN diff --git a/src/rocksdb/port/win/win_thread.h b/src/rocksdb/port/win/win_thread.h new file mode 100644 index 000000000..916033b77 --- /dev/null +++ b/src/rocksdb/port/win/win_thread.h @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifndef _POSIX_THREADS + +#include <functional> +#include <memory> +#include <type_traits> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { + +// This class is a replacement for std::thread +// 2 reasons we do not like std::thread: +// -- is that it dynamically allocates its internals that are automatically +// freed when the thread terminates and not on the destruction of the +// object. This makes it difficult to control the source of memory +// allocation +// - This implements Pimpl so we can easily replace the guts of the +// object in our private version if necessary. +class WindowsThread { + struct Data; + + std::shared_ptr<Data> data_; + unsigned int th_id_; + + void Init(std::function<void()>&&); + + public: + using native_handle_type = void*; + + // Construct with no thread + WindowsThread(); + + // Template constructor + // + // This templated constructor accomplishes several things + // + // - Allows the class as whole to be not a template + // + // - take "universal" references to support both _lvalues and _rvalues + // + // - because this constructor is a catchall case in many respects it + // may prevent us from using both the default __ctor, the move __ctor. + // Also it may circumvent copy __ctor deletion. To work around this + // we make sure this one has at least one argument and eliminate + // it from the overload selection when WindowsThread is the first + // argument. + // + // - construct with Fx(Ax...) with a variable number of types/arguments. + // + // - Gathers together the callable object with its arguments and constructs + // a single callable entity + // + // - Makes use of std::function to convert it to a specification-template + // dependent type that both checks the signature conformance to ensure + // that all of the necessary arguments are provided and allows pimpl + // implementation. + template <class Fn, class... Args, + class = typename std::enable_if<!std::is_same< + typename std::decay<Fn>::type, WindowsThread>::value>::type> + explicit WindowsThread(Fn&& fx, Args&&... ax) : WindowsThread() { + // Use binder to create a single callable entity + auto binder = std::bind(std::forward<Fn>(fx), std::forward<Args>(ax)...); + // Use std::function to take advantage of the type erasure + // so we can still hide implementation within pimpl + // This also makes sure that the binder signature is compliant + std::function<void()> target = binder; + + Init(std::move(target)); + } + + ~WindowsThread(); + + WindowsThread(const WindowsThread&) = delete; + + WindowsThread& operator=(const WindowsThread&) = delete; + + WindowsThread(WindowsThread&&) noexcept; + + WindowsThread& operator=(WindowsThread&&) noexcept; + + bool joinable() const; + + unsigned int get_id() const { return th_id_; } + + native_handle_type native_handle() const; + + static unsigned hardware_concurrency(); + + void join(); + + bool detach(); + + void swap(WindowsThread&); +}; +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +namespace std { +inline void swap(ROCKSDB_NAMESPACE::port::WindowsThread& th1, + ROCKSDB_NAMESPACE::port::WindowsThread& th2) { + th1.swap(th2); +} +} // namespace std + +#endif // !_POSIX_THREADS diff --git a/src/rocksdb/port/win/xpress_win.cc b/src/rocksdb/port/win/xpress_win.cc new file mode 100644 index 000000000..21904d502 --- /dev/null +++ b/src/rocksdb/port/win/xpress_win.cc @@ -0,0 +1,210 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(OS_WIN) + +#include "port/win/xpress_win.h" + +#include <windows.h> + +#include <cassert> +#include <iostream> +#include <limits> +#include <memory> + +#ifdef XPRESS + +// Put this under ifdef so windows systems w/o this +// can still build +#include <compressapi.h> + +namespace ROCKSDB_NAMESPACE { +namespace port { +namespace xpress { + +// Helpers +namespace { + +auto CloseCompressorFun = [](void* h) { + if (NULL != h) { + ::CloseCompressor(reinterpret_cast<COMPRESSOR_HANDLE>(h)); + } +}; + +auto CloseDecompressorFun = [](void* h) { + if (NULL != h) { + ::CloseDecompressor(reinterpret_cast<DECOMPRESSOR_HANDLE>(h)); + } +}; +} // namespace + +bool Compress(const char* input, size_t length, std::string* output) { + assert(input != nullptr); + assert(output != nullptr); + + if (length == 0) { + output->clear(); + return true; + } + + COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; + + COMPRESSOR_HANDLE compressor = NULL; + + BOOL success = + CreateCompressor(COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &compressor); // Handle + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to create Compressor LastError: " + << GetLastError() << std::endl; +#endif + return false; + } + + std::unique_ptr<void, decltype(CloseCompressorFun)> compressorGuard( + compressor, CloseCompressorFun); + + SIZE_T compressedBufferSize = 0; + + // Query compressed buffer size. + success = ::Compress(compressor, // Compressor Handle + const_cast<char*>(input), // Input buffer + length, // Uncompressed data size + NULL, // Compressed Buffer + 0, // Compressed Buffer size + &compressedBufferSize); // Compressed Data size + + if (!success) { + auto lastError = GetLastError(); + + if (lastError != ERROR_INSUFFICIENT_BUFFER) { +#ifdef _DEBUG + std::cerr + << "XPRESS: Failed to estimate compressed buffer size LastError " + << lastError << std::endl; +#endif + return false; + } + } + + assert(compressedBufferSize > 0); + + std::string result; + result.resize(compressedBufferSize); + + SIZE_T compressedDataSize = 0; + + // Compress + success = ::Compress(compressor, // Compressor Handle + const_cast<char*>(input), // Input buffer + length, // Uncompressed data size + &result[0], // Compressed Buffer + compressedBufferSize, // Compressed Buffer size + &compressedDataSize); // Compressed Data size + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to compress LastError " << GetLastError() + << std::endl; +#endif + return false; + } + + result.resize(compressedDataSize); + output->swap(result); + + return true; +} + +char* Decompress(const char* input_data, size_t input_length, + size_t* uncompressed_size) { + assert(input_data != nullptr); + assert(uncompressed_size != nullptr); + + if (input_length == 0) { + return nullptr; + } + + COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; + + DECOMPRESSOR_HANDLE decompressor = NULL; + + BOOL success = + CreateDecompressor(COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &decompressor); // Handle + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to create Decompressor LastError " + << GetLastError() << std::endl; +#endif + return nullptr; + } + + std::unique_ptr<void, decltype(CloseDecompressorFun)> compressorGuard( + decompressor, CloseDecompressorFun); + + SIZE_T decompressedBufferSize = 0; + + success = ::Decompress(decompressor, // Compressor Handle + const_cast<char*>(input_data), // Compressed data + input_length, // Compressed data size + NULL, // Buffer set to NULL + 0, // Buffer size set to 0 + &decompressedBufferSize); // Decompressed Data size + + if (!success) { + auto lastError = GetLastError(); + + if (lastError != ERROR_INSUFFICIENT_BUFFER) { +#ifdef _DEBUG + std::cerr + << "XPRESS: Failed to estimate decompressed buffer size LastError " + << lastError << std::endl; +#endif + return nullptr; + } + } + + assert(decompressedBufferSize > 0); + + // The callers are deallocating using delete[] + // thus we must allocate with new[] + std::unique_ptr<char[]> outputBuffer(new char[decompressedBufferSize]); + + SIZE_T decompressedDataSize = 0; + + success = ::Decompress(decompressor, const_cast<char*>(input_data), + input_length, outputBuffer.get(), + decompressedBufferSize, &decompressedDataSize); + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to decompress LastError " << GetLastError() + << std::endl; +#endif + return nullptr; + } + + *uncompressed_size = decompressedDataSize; + + // Return the raw buffer to the caller supporting the tradition + return outputBuffer.release(); +} +} // namespace xpress +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#endif + +#endif diff --git a/src/rocksdb/port/win/xpress_win.h b/src/rocksdb/port/win/xpress_win.h new file mode 100644 index 000000000..187adffa6 --- /dev/null +++ b/src/rocksdb/port/win/xpress_win.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { +namespace xpress { + +bool Compress(const char* input, size_t length, std::string* output); + +char* Decompress(const char* input_data, size_t input_length, + size_t* uncompressed_size); +} // namespace xpress +} // namespace port +} // namespace ROCKSDB_NAMESPACE |