From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- src/rocksdb/port/README | 10 + src/rocksdb/port/jemalloc_helper.h | 77 ++ src/rocksdb/port/likely.h | 18 + src/rocksdb/port/malloc.h | 17 + src/rocksdb/port/port.h | 21 + src/rocksdb/port/port_dirent.h | 44 + src/rocksdb/port/port_example.h | 101 +++ src/rocksdb/port/port_posix.cc | 234 ++++++ src/rocksdb/port/port_posix.h | 218 +++++ src/rocksdb/port/stack_trace.cc | 138 +++ src/rocksdb/port/stack_trace.h | 22 + src/rocksdb/port/sys_time.h | 47 ++ src/rocksdb/port/util_logger.h | 20 + src/rocksdb/port/win/env_default.cc | 41 + src/rocksdb/port/win/env_win.cc | 1540 ++++++++++++++++++++++++++++++++++ src/rocksdb/port/win/env_win.h | 347 ++++++++ src/rocksdb/port/win/io_win.cc | 1069 +++++++++++++++++++++++ src/rocksdb/port/win/io_win.h | 456 ++++++++++ src/rocksdb/port/win/port_win.cc | 269 ++++++ src/rocksdb/port/win/port_win.h | 398 +++++++++ src/rocksdb/port/win/win_jemalloc.cc | 75 ++ src/rocksdb/port/win/win_logger.cc | 192 +++++ src/rocksdb/port/win/win_logger.h | 66 ++ src/rocksdb/port/win/win_thread.cc | 179 ++++ src/rocksdb/port/win/win_thread.h | 122 +++ src/rocksdb/port/win/xpress_win.cc | 226 +++++ src/rocksdb/port/win/xpress_win.h | 27 + src/rocksdb/port/xpress.h | 17 + 28 files changed, 5991 insertions(+) create mode 100644 src/rocksdb/port/README create mode 100644 src/rocksdb/port/jemalloc_helper.h create mode 100644 src/rocksdb/port/likely.h create mode 100644 src/rocksdb/port/malloc.h create mode 100644 src/rocksdb/port/port.h create mode 100644 src/rocksdb/port/port_dirent.h create mode 100644 src/rocksdb/port/port_example.h create mode 100644 src/rocksdb/port/port_posix.cc create mode 100644 src/rocksdb/port/port_posix.h create mode 100644 src/rocksdb/port/stack_trace.cc create mode 100644 src/rocksdb/port/stack_trace.h create mode 100644 
src/rocksdb/port/sys_time.h create mode 100644 src/rocksdb/port/util_logger.h create mode 100644 src/rocksdb/port/win/env_default.cc create mode 100644 src/rocksdb/port/win/env_win.cc create mode 100644 src/rocksdb/port/win/env_win.h create mode 100644 src/rocksdb/port/win/io_win.cc create mode 100644 src/rocksdb/port/win/io_win.h create mode 100644 src/rocksdb/port/win/port_win.cc create mode 100644 src/rocksdb/port/win/port_win.h create mode 100644 src/rocksdb/port/win/win_jemalloc.cc create mode 100644 src/rocksdb/port/win/win_logger.cc create mode 100644 src/rocksdb/port/win/win_logger.h create mode 100644 src/rocksdb/port/win/win_thread.cc create mode 100644 src/rocksdb/port/win/win_thread.h create mode 100644 src/rocksdb/port/win/xpress_win.cc create mode 100644 src/rocksdb/port/win/xpress_win.h create mode 100644 src/rocksdb/port/xpress.h (limited to 'src/rocksdb/port') diff --git a/src/rocksdb/port/README b/src/rocksdb/port/README new file mode 100644 index 000000000..422563e25 --- /dev/null +++ b/src/rocksdb/port/README @@ -0,0 +1,10 @@ +This directory contains interfaces and implementations that isolate the +rest of the package from platform details. + +Code in the rest of the package includes "port.h" from this directory. +"port.h" in turn includes a platform specific "port_<platform>.h" file +that provides the platform specific implementation. + +See port_posix.h for an example of what must be provided in a platform +specific header file. + diff --git a/src/rocksdb/port/jemalloc_helper.h b/src/rocksdb/port/jemalloc_helper.h new file mode 100644 index 000000000..f6f72f8cb --- /dev/null +++ b/src/rocksdb/port/jemalloc_helper.h @@ -0,0 +1,77 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#if defined(__clang__) +// glibc's `posix_memalign()` declaration specifies `throw()` while clang's +// declaration does not. There is a hack in clang to make its re-declaration +// compatible with glibc's if they are declared consecutively. That hack breaks +// if yet another `posix_memalign()` declaration comes between glibc's and +// clang's declarations. Include "mm_malloc.h" here ensures glibc's and clang's +// declarations both come before "jemalloc.h"'s `posix_memalign()` declaration. +// +// This problem could also be avoided if "jemalloc.h"'s `posix_memalign()` +// declaration did not specify `throw()` when built with clang. +#include +#endif + +#ifdef ROCKSDB_JEMALLOC +#ifdef __FreeBSD__ +#include +#else +#define JEMALLOC_MANGLE +#include +#endif + +#ifndef JEMALLOC_CXX_THROW +#define JEMALLOC_CXX_THROW +#endif + +#if defined(OS_WIN) && defined(_MSC_VER) + +// MSVC does not have weak symbol support. As long as ROCKSDB_JEMALLOC is +// defined, Jemalloc memory allocator is used. +static inline bool HasJemalloc() { return true; } + +#else + +// Declare non-standard jemalloc APIs as weak symbols. We can null-check these +// symbols to detect whether jemalloc is linked with the binary. 
+extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); +extern "C" void* rallocx(void*, size_t, int) __attribute__((__weak__)); +extern "C" size_t xallocx(void*, size_t, size_t, int) __attribute__((__weak__)); +extern "C" size_t sallocx(const void*, int) __attribute__((__weak__)); +extern "C" void dallocx(void*, int) __attribute__((__weak__)); +extern "C" void sdallocx(void*, size_t, int) __attribute__((__weak__)); +extern "C" size_t nallocx(size_t, int) __attribute__((__weak__)); +extern "C" int mallctl(const char*, void*, size_t*, void*, size_t) + __attribute__((__weak__)); +extern "C" int mallctlnametomib(const char*, size_t*, size_t*) + __attribute__((__weak__)); +extern "C" int mallctlbymib(const size_t*, size_t, void*, size_t*, void*, + size_t) __attribute__((__weak__)); +extern "C" void malloc_stats_print(void (*)(void*, const char*), void*, + const char*) __attribute__((__weak__)); +extern "C" size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) + JEMALLOC_CXX_THROW __attribute__((__weak__)); + +// Check if Jemalloc is linked with the binary. Note the main program might be +// using a different memory allocator even if this method returns true. 
+// It is loosely based on folly::usingJEMalloc(), minus the check that actually +// allocate memory and see if it is through jemalloc, to handle the dlopen() +// case: +// https://github.com/facebook/folly/blob/76cf8b5841fb33137cfbf8b224f0226437c855bc/folly/memory/Malloc.h#L147 +static inline bool HasJemalloc() { + return mallocx != nullptr && rallocx != nullptr && xallocx != nullptr && + sallocx != nullptr && dallocx != nullptr && sdallocx != nullptr && + nallocx != nullptr && mallctl != nullptr && + mallctlnametomib != nullptr && mallctlbymib != nullptr && + malloc_stats_print != nullptr && malloc_usable_size != nullptr; +} + +#endif + +#endif // ROCKSDB_JEMALLOC diff --git a/src/rocksdb/port/likely.h b/src/rocksdb/port/likely.h new file mode 100644 index 000000000..397d75713 --- /dev/null +++ b/src/rocksdb/port/likely.h @@ -0,0 +1,18 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#if defined(__GNUC__) && __GNUC__ >= 4 +#define LIKELY(x) (__builtin_expect((x), 1)) +#define UNLIKELY(x) (__builtin_expect((x), 0)) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif diff --git a/src/rocksdb/port/malloc.h b/src/rocksdb/port/malloc.h new file mode 100644 index 000000000..f973263e2 --- /dev/null +++ b/src/rocksdb/port/malloc.h @@ -0,0 +1,17 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else +#include +#endif // OS_FREEBSD +#endif // ROCKSDB_MALLOC_USABLE_SIZE diff --git a/src/rocksdb/port/port.h b/src/rocksdb/port/port.h new file mode 100644 index 000000000..13aa56d47 --- /dev/null +++ b/src/rocksdb/port/port.h @@ -0,0 +1,21 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +// Include the appropriate platform specific file below. If you are +// porting to a new platform, see "port_example.h" for documentation +// of what the new port_.h file must provide. +#if defined(ROCKSDB_PLATFORM_POSIX) +#include "port/port_posix.h" +#elif defined(OS_WIN) +#include "port/win/port_win.h" +#endif diff --git a/src/rocksdb/port/port_dirent.h b/src/rocksdb/port/port_dirent.h new file mode 100644 index 000000000..192abec4d --- /dev/null +++ b/src/rocksdb/port/port_dirent.h @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#pragma once + +#ifdef ROCKSDB_PLATFORM_POSIX +#include +#include +#elif defined(OS_WIN) + +namespace ROCKSDB_NAMESPACE { +namespace port { + +struct dirent { + char d_name[_MAX_PATH]; /* filename */ +}; + +struct DIR; + +DIR* opendir(const char* name); + +dirent* readdir(DIR* dirp); + +int closedir(DIR* dirp); + +} // namespace port + +using port::dirent; +using port::DIR; +using port::opendir; +using port::readdir; +using port::closedir; + +} // namespace ROCKSDB_NAMESPACE + +#endif // OS_WIN diff --git a/src/rocksdb/port/port_example.h b/src/rocksdb/port/port_example.h new file mode 100644 index 000000000..b906bbec7 --- /dev/null +++ b/src/rocksdb/port/port_example.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This file contains the specification, but not the implementations, +// of the types/operations/etc. that should be defined by a platform +// specific port_.h file. Use this file as a reference for +// how to port this package to a new platform. + +#pragma once + +namespace ROCKSDB_NAMESPACE { +namespace port { + +// TODO(jorlow): Many of these belong more in the environment class rather than +// here. We should try moving them and see if it affects perf. + +// The following boolean constant must be true on a little-endian machine +// and false otherwise. 
+static const bool kLittleEndian = true /* or some other expression */; + +// ------------------ Threading ------------------- + +// A Mutex represents an exclusive lock. +class Mutex { + public: + Mutex(); + ~Mutex(); + + // Lock the mutex. Waits until other lockers have exited. + // Will deadlock if the mutex is already locked by this thread. + void Lock(); + + // Unlock the mutex. + // REQUIRES: This mutex was locked by this thread. + void Unlock(); + + // Optionally crash if this thread does not hold this mutex. + // The implementation must be fast, especially if NDEBUG is + // defined. The implementation is allowed to skip all checks. + void AssertHeld(); +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + + // Atomically release *mu and block on this condition variable until + // either a call to SignalAll(), or a call to Signal() that picks + // this thread to wakeup. + // REQUIRES: this thread holds *mu + void Wait(); + + // If there are some threads waiting, wake up at least one of them. + void Signal(); + + // Wake up all waiting threads. + void SignallAll(); +}; + +// Thread-safe initialization. +// Used as follows: +// static port::OnceType init_control = LEVELDB_ONCE_INIT; +// static void Initializer() { ... do something ...; } +// ... +// port::InitOnce(&init_control, &Initializer); +typedef intptr_t OnceType; +#define LEVELDB_ONCE_INIT 0 +extern void InitOnce(port::OnceType*, void (*initializer)()); + +// ------------------ Compression ------------------- + +// Store the snappy compression of "input[0,input_length-1]" in *output. +// Returns false if snappy is not supported by this port. +extern bool Snappy_Compress(const char* input, size_t input_length, + std::string* output); + +// If input[0,input_length-1] looks like a valid snappy compressed +// buffer, store the size of the uncompressed data in *result and +// return true. Else return false. 
+extern bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result); + +// Attempt to snappy uncompress input[0,input_length-1] into *output. +// Returns true if successful, false if the input is invalid lightweight +// compressed data. +// +// REQUIRES: at least the first "n" bytes of output[] must be writable +// where "n" is the result of a successful call to +// Snappy_GetUncompressedLength. +extern bool Snappy_Uncompress(const char* input_data, size_t input_length, + char* output); + +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/port_posix.cc b/src/rocksdb/port/port_posix.cc new file mode 100644 index 000000000..e3ea5ca69 --- /dev/null +++ b/src/rocksdb/port/port_posix.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "port/port_posix.h" + +#include +#if defined(__i386__) || defined(__x86_64__) +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "logging/logging.h" + +namespace ROCKSDB_NAMESPACE { + +// We want to give users opportunity to default all the mutexes to adaptive if +// not specified otherwise. This enables a quick way to conduct various +// performance related experiments. +// +// NB! Support for adaptive mutexes is turned on by defining +// ROCKSDB_PTHREAD_ADAPTIVE_MUTEX during the compilation. If you use RocksDB +// build environment then this happens automatically; otherwise it's up to the +// consumer to define the identifier. 
+#ifdef ROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX +extern const bool kDefaultToAdaptiveMutex = true; +#else +extern const bool kDefaultToAdaptiveMutex = false; +#endif + +namespace port { + +static int PthreadCall(const char* label, int result) { + if (result != 0 && result != ETIMEDOUT) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + abort(); + } + return result; +} + +Mutex::Mutex(bool adaptive) { + (void) adaptive; +#ifdef ROCKSDB_PTHREAD_ADAPTIVE_MUTEX + if (!adaptive) { + PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); + } else { + pthread_mutexattr_t mutex_attr; + PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr)); + PthreadCall("set mutex attr", + pthread_mutexattr_settype(&mutex_attr, + PTHREAD_MUTEX_ADAPTIVE_NP)); + PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr)); + PthreadCall("destroy mutex attr", + pthread_mutexattr_destroy(&mutex_attr)); + } +#else + PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); +#endif // ROCKSDB_PTHREAD_ADAPTIVE_MUTEX +} + +Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } + +void Mutex::Lock() { + PthreadCall("lock", pthread_mutex_lock(&mu_)); +#ifndef NDEBUG + locked_ = true; +#endif +} + +void Mutex::Unlock() { +#ifndef NDEBUG + locked_ = false; +#endif + PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void Mutex::AssertHeld() { +#ifndef NDEBUG + assert(locked_); +#endif +} + +CondVar::CondVar(Mutex* mu) + : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, nullptr)); +} + +CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } + +void CondVar::Wait() { +#ifndef NDEBUG + mu_->locked_ = false; +#endif + PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); +#ifndef NDEBUG + mu_->locked_ = true; +#endif +} + +bool CondVar::TimedWait(uint64_t abs_time_us) { + struct timespec ts; + ts.tv_sec = static_cast(abs_time_us / 1000000); + ts.tv_nsec = static_cast((abs_time_us % 1000000) * 1000); 
+ +#ifndef NDEBUG + mu_->locked_ = false; +#endif + int err = pthread_cond_timedwait(&cv_, &mu_->mu_, &ts); +#ifndef NDEBUG + mu_->locked_ = true; +#endif + if (err == ETIMEDOUT) { + return true; + } + if (err != 0) { + PthreadCall("timedwait", err); + } + return false; +} + +void CondVar::Signal() { + PthreadCall("signal", pthread_cond_signal(&cv_)); +} + +void CondVar::SignalAll() { + PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); +} + +RWMutex::RWMutex() { + PthreadCall("init mutex", pthread_rwlock_init(&mu_, nullptr)); +} + +RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); } + +void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); } + +void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); } + +void RWMutex::ReadUnlock() { PthreadCall("read unlock", pthread_rwlock_unlock(&mu_)); } + +void RWMutex::WriteUnlock() { PthreadCall("write unlock", pthread_rwlock_unlock(&mu_)); } + +int PhysicalCoreID() { +#if defined(ROCKSDB_SCHED_GETCPU_PRESENT) && defined(__x86_64__) && \ + (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 22)) + // sched_getcpu uses VDSO getcpu() syscall since 2.22. I believe Linux offers VDSO + // support only on x86_64. This is the fastest/preferred method if available. + int cpuno = sched_getcpu(); + if (cpuno < 0) { + return -1; + } + return cpuno; +#elif defined(__x86_64__) || defined(__i386__) + // clang/gcc both provide cpuid.h, which defines __get_cpuid(), for x86_64 and i386. + unsigned eax, ebx = 0, ecx, edx; + if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { + return -1; + } + return ebx >> 24; +#else + // give up, the caller can generate a random number or something. 
+ return -1; +#endif +} + +void InitOnce(OnceType* once, void (*initializer)()) { + PthreadCall("once", pthread_once(once, initializer)); +} + +void Crash(const std::string& srcfile, int srcline) { + fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); + fflush(stdout); + kill(getpid(), SIGTERM); +} + +int GetMaxOpenFiles() { +#if defined(RLIMIT_NOFILE) + struct rlimit no_files_limit; + if (getrlimit(RLIMIT_NOFILE, &no_files_limit) != 0) { + return -1; + } + // protect against overflow + if (static_cast(no_files_limit.rlim_cur) >= + static_cast(std::numeric_limits::max())) { + return std::numeric_limits::max(); + } + return static_cast(no_files_limit.rlim_cur); +#endif + return -1; +} + +void *cacheline_aligned_alloc(size_t size) { +#if __GNUC__ < 5 && defined(__SANITIZE_ADDRESS__) + return malloc(size); +#elif ( _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__)) + void *m; + errno = posix_memalign(&m, CACHE_LINE_SIZE, size); + return errno ? nullptr : m; +#else + return malloc(size); +#endif +} + +void cacheline_aligned_free(void *memblock) { + free(memblock); +} + +static size_t GetPageSize() { +#if defined(OS_LINUX) || defined(_SC_PAGESIZE) + long v = sysconf(_SC_PAGESIZE); + if (v >= 1024) { + return static_cast(v); + } +#endif + // Default assume 4KB + return 4U * 1024U; +} + +const size_t kPageSize = GetPageSize(); + +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/port_posix.h b/src/rocksdb/port/port_posix.h new file mode 100644 index 000000000..0c9c69833 --- /dev/null +++ b/src/rocksdb/port/port_posix.h @@ -0,0 +1,218 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// See port_example.h for documentation for the following types/functions. + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +// size_t printf formatting named in the manner of C99 standard formatting +// strings such as PRIu64 +// in fact, we could use that one +#define ROCKSDB_PRIszt "zu" + +#define __declspec(S) + +#define ROCKSDB_NOEXCEPT noexcept + +#undef PLATFORM_IS_LITTLE_ENDIAN +#if defined(OS_MACOSX) + #include + #if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER) + #define PLATFORM_IS_LITTLE_ENDIAN \ + (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN) + #endif +#elif defined(OS_SOLARIS) + #include + #ifdef _LITTLE_ENDIAN + #define PLATFORM_IS_LITTLE_ENDIAN true + #else + #define PLATFORM_IS_LITTLE_ENDIAN false + #endif + #include +#elif defined(OS_AIX) + #include + #include + #define PLATFORM_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN) + #include +#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || \ + defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID) + #include + #include + #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN) +#else + #include +#endif +#include + +#include +#include +#include +#include + +#ifndef PLATFORM_IS_LITTLE_ENDIAN +#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) +#endif + +#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\ + defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\ + defined(OS_ANDROID) || defined(CYGWIN) || defined(OS_AIX) +// Use fread/fwrite/fflush on platforms without _unlocked variants +#define fread_unlocked fread +#define fwrite_unlocked fwrite +#define fflush_unlocked fflush +#endif + +#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\ + defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) +// Use fsync() on platforms without 
fdatasync() +#define fdatasync fsync +#endif + +#if defined(OS_ANDROID) && __ANDROID_API__ < 9 +// fdatasync() was only introduced in API level 9 on Android. Use fsync() +// when targeting older platforms. +#define fdatasync fsync +#endif + +namespace ROCKSDB_NAMESPACE { + +extern const bool kDefaultToAdaptiveMutex; + +namespace port { + +// For use at db/file_indexer.h kLevelMaxIndex +const uint32_t kMaxUint32 = std::numeric_limits::max(); +const int kMaxInt32 = std::numeric_limits::max(); +const int kMinInt32 = std::numeric_limits::min(); +const uint64_t kMaxUint64 = std::numeric_limits::max(); +const int64_t kMaxInt64 = std::numeric_limits::max(); +const int64_t kMinInt64 = std::numeric_limits::min(); +const size_t kMaxSizet = std::numeric_limits::max(); + +constexpr bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN; +#undef PLATFORM_IS_LITTLE_ENDIAN + +class CondVar; + +class Mutex { + public: + explicit Mutex(bool adaptive = kDefaultToAdaptiveMutex); + // No copying + Mutex(const Mutex&) = delete; + void operator=(const Mutex&) = delete; + + ~Mutex(); + + void Lock(); + void Unlock(); + // this will assert if the mutex is not locked + // it does NOT verify that mutex is held by a calling thread + void AssertHeld(); + + private: + friend class CondVar; + pthread_mutex_t mu_; +#ifndef NDEBUG + bool locked_; +#endif +}; + +class RWMutex { + public: + RWMutex(); + // No copying allowed + RWMutex(const RWMutex&) = delete; + void operator=(const RWMutex&) = delete; + + ~RWMutex(); + + void ReadLock(); + void WriteLock(); + void ReadUnlock(); + void WriteUnlock(); + void AssertHeld() { } + + private: + pthread_rwlock_t mu_; // the underlying platform mutex +}; + +class CondVar { + public: + explicit CondVar(Mutex* mu); + ~CondVar(); + void Wait(); + // Timed condition wait. Returns true if timeout occurred. 
+ bool TimedWait(uint64_t abs_time_us); + void Signal(); + void SignalAll(); + private: + pthread_cond_t cv_; + Mutex* mu_; +}; + +using Thread = std::thread; + +static inline void AsmVolatilePause() { +#if defined(__i386__) || defined(__x86_64__) + asm volatile("pause"); +#elif defined(__aarch64__) + asm volatile("wfe"); +#elif defined(__powerpc64__) + asm volatile("or 27,27,27"); +#endif + // it's okay for other platforms to be no-ops +} + +// Returns -1 if not available on this platform +extern int PhysicalCoreID(); + +typedef pthread_once_t OnceType; +#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT +extern void InitOnce(OnceType* once, void (*initializer)()); + +#ifndef CACHE_LINE_SIZE +// To test behavior with non-native cache line size, e.g. for +// Bloom filters, set TEST_CACHE_LINE_SIZE to the desired test size. +// This disables ALIGN_AS to keep it from failing compilation. +#ifdef TEST_CACHE_LINE_SIZE +#define CACHE_LINE_SIZE TEST_CACHE_LINE_SIZE +#define ALIGN_AS(n) /*empty*/ +#else +#if defined(__s390__) +#define CACHE_LINE_SIZE 256U +#elif defined(__powerpc__) || defined(__aarch64__) +#define CACHE_LINE_SIZE 128U +#else +#define CACHE_LINE_SIZE 64U +#endif +#define ALIGN_AS(n) alignas(n) +#endif +#endif + +static_assert((CACHE_LINE_SIZE & (CACHE_LINE_SIZE - 1)) == 0, + "Cache line size must be a power of 2 number of bytes"); + +extern void *cacheline_aligned_alloc(size_t size); + +extern void cacheline_aligned_free(void *memblock); + +#define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) + +extern void Crash(const std::string& srcfile, int srcline); + +extern int GetMaxOpenFiles(); + +extern const size_t kPageSize; + +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/stack_trace.cc b/src/rocksdb/port/stack_trace.cc new file mode 100644 index 000000000..32eb45bfb --- /dev/null +++ b/src/rocksdb/port/stack_trace.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "port/stack_trace.h" + +#if defined(ROCKSDB_LITE) || !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || \ + defined(CYGWIN) || defined(OS_FREEBSD) || defined(OS_SOLARIS) + +// noop + +namespace ROCKSDB_NAMESPACE { +namespace port { +void InstallStackTraceHandler() {} +void PrintStack(int /*first_frames_to_skip*/) {} +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#else + +#include +#include +#include +#include +#include +#include +#include + +namespace ROCKSDB_NAMESPACE { +namespace port { + +namespace { + +#if defined(OS_LINUX) || defined(OS_FREEBSD) +const char* GetExecutableName() { + static char name[1024]; + + char link[1024]; + snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); + auto read = readlink(link, name, sizeof(name) - 1); + if (-1 == read) { + return nullptr; + } else { + name[read] = 0; + return name; + } +} + +void PrintStackTraceLine(const char* symbol, void* frame) { + static const char* executable = GetExecutableName(); + if (symbol) { + fprintf(stderr, "%s ", symbol); + } + if (executable) { + // out source to addr2line, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + snprintf(cmd, kLineMax, "addr2line %p -e %s -f -C 2>&1", frame, executable); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } + } else { + fprintf(stderr, " %p", frame); + } + + fprintf(stderr, "\n"); +} +#elif defined(OS_MACOSX) + +void PrintStackTraceLine(const char* symbol, void* frame) { + static int pid = getpid(); + // out source to atos, for the address translation + const int kLineMax = 256; + char cmd[kLineMax]; + snprintf(cmd, kLineMax, "xcrun atos %p -p %d 2>&1", 
frame, pid); + auto f = popen(cmd, "r"); + if (f) { + char line[kLineMax]; + while (fgets(line, sizeof(line), f)) { + line[strlen(line) - 1] = 0; // remove newline + fprintf(stderr, "%s\t", line); + } + pclose(f); + } else if (symbol) { + fprintf(stderr, "%s ", symbol); + } + + fprintf(stderr, "\n"); +} + +#endif + +} // namespace + +void PrintStack(int first_frames_to_skip) { + const int kMaxFrames = 100; + void* frames[kMaxFrames]; + + auto num_frames = backtrace(frames, kMaxFrames); + auto symbols = backtrace_symbols(frames, num_frames); + + for (int i = first_frames_to_skip; i < num_frames; ++i) { + fprintf(stderr, "#%-2d ", i - first_frames_to_skip); + PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]); + } + free(symbols); +} + +static void StackTraceHandler(int sig) { + // reset to default handler + signal(sig, SIG_DFL); + fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); + // skip the top three signal handler related frames + PrintStack(3); + // re-signal to default handler (so we still get core dump if needed...) + raise(sig); +} + +void InstallStackTraceHandler() { + // just use the plain old signal as it's simple and sufficient + // for this use case + signal(SIGILL, StackTraceHandler); + signal(SIGSEGV, StackTraceHandler); + signal(SIGBUS, StackTraceHandler); + signal(SIGABRT, StackTraceHandler); +} + +} // namespace port +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/port/stack_trace.h b/src/rocksdb/port/stack_trace.h new file mode 100644 index 000000000..4924f5b1b --- /dev/null +++ b/src/rocksdb/port/stack_trace.h @@ -0,0 +1,22 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { + +// Install a signal handler to print callstack on the following signals: +// SIGILL SIGSEGV SIGBUS SIGABRT +// Currently supports Linux and macOS only. No-op otherwise. +void InstallStackTraceHandler(); + +// Prints stack, skips the first first_frames_to_skip frames +void PrintStack(int first_frames_to_skip = 0); + +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/sys_time.h b/src/rocksdb/port/sys_time.h new file mode 100644 index 000000000..1c7d206a7 --- /dev/null +++ b/src/rocksdb/port/sys_time.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This file is a portable substitute for sys/time.h which does not exist on +// Windows + +#pragma once + +#if defined(OS_WIN) && defined(_MSC_VER) + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +namespace port { + +// Avoid including winsock2.h for this definition +typedef struct timeval { + long tv_sec; + long tv_usec; +} timeval; + +void gettimeofday(struct timeval* tv, struct timezone* tz); + +inline struct tm* localtime_r(const time_t* timep, struct tm* result) { + errno_t ret = localtime_s(result, timep); + return (ret == 0) ? 
result : NULL; +} +} + +using port::timeval; +using port::gettimeofday; +using port::localtime_r; +} // namespace ROCKSDB_NAMESPACE + +#else +#include +#include +#endif diff --git a/src/rocksdb/port/util_logger.h b/src/rocksdb/port/util_logger.h new file mode 100644 index 000000000..d2d62a987 --- /dev/null +++ b/src/rocksdb/port/util_logger.h @@ -0,0 +1,20 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +// Include the appropriate platform specific file below. If you are +// porting to a new platform, see "port_example.h" for documentation +// of what the new port_.h file must provide. + +#if defined(ROCKSDB_PLATFORM_POSIX) +#include "logging/posix_logger.h" +#elif defined(OS_WIN) +#include "port/win/win_logger.h" +#endif diff --git a/src/rocksdb/port/win/env_default.cc b/src/rocksdb/port/win/env_default.cc new file mode 100644 index 000000000..663779dde --- /dev/null +++ b/src/rocksdb/port/win/env_default.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include + +#include +#include "port/win/env_win.h" +#include "test_util/sync_point.h" +#include "util/compression_context_cache.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { + +// We choose not to destroy the env because joining the threads from the +// system loader +// which destroys the statics (same as from DLLMain) creates a system loader +// dead-lock. +// in this manner any remaining threads are terminated OK. +namespace { + std::once_flag winenv_once_flag; + Env* envptr; +}; +} + +Env* Env::Default() { + using namespace port; + ThreadLocalPtr::InitSingletons(); + CompressionContextCache::InitSingleton(); + INIT_SYNC_POINT_SINGLETONS(); + std::call_once(winenv_once_flag, []() { envptr = new WinEnv(); }); + return envptr; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/win/env_win.cc b/src/rocksdb/port/win/env_win.cc new file mode 100644 index 000000000..295bb2b7e --- /dev/null +++ b/src/rocksdb/port/win/env_win.cc @@ -0,0 +1,1540 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "port/win/env_win.h" +#include "port/win/win_thread.h" +#include +#include +#include + +#include +#include // _getpid +#include // _access +#include // _rmdir, _mkdir, _getcwd +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/slice.h" + +#include "port/port.h" +#include "port/port_dirent.h" +#include "port/win/win_logger.h" +#include "port/win/io_win.h" + +#include "monitoring/iostats_context_imp.h" + +#include "monitoring/thread_status_updater.h" +#include "monitoring/thread_status_util.h" + +#include // for uuid generation +#include +#include +#include "strsafe.h" + +#include + +namespace ROCKSDB_NAMESPACE { + +ThreadStatusUpdater* CreateThreadStatusUpdater() { + return new ThreadStatusUpdater(); +} + +namespace { + +// Sector size used when physical sector size cannot be obtained from device. +static const size_t kSectorSize = 512; + +// RAII helpers for HANDLEs +const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); }; +typedef std::unique_ptr UniqueCloseHandlePtr; + +const auto FindCloseFunc = [](HANDLE h) { ::FindClose(h); }; +typedef std::unique_ptr UniqueFindClosePtr; + +void WinthreadCall(const char* label, std::error_code result) { + if (0 != result.value()) { + fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value())); + abort(); + } +} + +} + +namespace port { + +WinEnvIO::WinEnvIO(Env* hosted_env) + : hosted_env_(hosted_env), + page_size_(4 * 1024), + allocation_granularity_(page_size_), + perf_counter_frequency_(0), + nano_seconds_per_period_(0), + GetSystemTimePreciseAsFileTime_(NULL) { + + SYSTEM_INFO sinfo; + GetSystemInfo(&sinfo); + + page_size_ = sinfo.dwPageSize; + allocation_granularity_ = sinfo.dwAllocationGranularity; + + { + LARGE_INTEGER qpf; + BOOL ret __attribute__((__unused__)); + ret = QueryPerformanceFrequency(&qpf); + assert(ret == TRUE); + perf_counter_frequency_ = qpf.QuadPart; + + if (std::nano::den % perf_counter_frequency_ == 0) { + nano_seconds_per_period_ = std::nano::den / 
perf_counter_frequency_; + } + } + + HMODULE module = GetModuleHandle("kernel32.dll"); + if (module != NULL) { + GetSystemTimePreciseAsFileTime_ = + (FnGetSystemTimePreciseAsFileTime)GetProcAddress( + module, "GetSystemTimePreciseAsFileTime"); + } +} + +WinEnvIO::~WinEnvIO() { +} + +Status WinEnvIO::DeleteFile(const std::string& fname) { + Status result; + + BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + + if(!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to delete: " + fname, + lastError); + } + + return result; +} + +Status WinEnvIO::Truncate(const std::string& fname, size_t size) { + Status s; + int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); + if (result != 0) { + s = IOError("Failed to truncate: " + fname, errno); + } + return s; +} + +Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { + time_t time = std::time(nullptr); + if (time == (time_t)(-1)) { + return Status::NotSupported("Failed to get time"); + } + + *unix_time = time; + return Status::OK(); +} + +Status WinEnvIO::NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + Status s; + + result->reset(); + + // Corruption test needs to rename and delete files of these kind + // while they are still open with another handle. For that reason we + // allow share_write and delete(allows rename). 
+ HANDLE hFile = INVALID_HANDLE_VALUE; + + DWORD fileFlags = FILE_ATTRIBUTE_READONLY; + + if (options.use_direct_reads && !options.use_mmap_reads) { + fileFlags |= FILE_FLAG_NO_BUFFERING; + } + + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, + OPEN_EXISTING, // Original fopen mode is "rb" + fileFlags, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname, + lastError); + } else { + result->reset(new WinSequentialFile(fname, hFile, options)); + } + return s; +} + +Status WinEnvIO::NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + result->reset(); + Status s; + + // Open the file for read-only random access + // Random access is to disable read-ahead as the system reads too much data + DWORD fileFlags = FILE_ATTRIBUTE_READONLY; + + if (options.use_direct_reads && !options.use_mmap_reads) { + fileFlags |= FILE_FLAG_NO_BUFFERING; + } else { + fileFlags |= FILE_FLAG_RANDOM_ACCESS; + } + + /// Shared access is necessary for corruption test to pass + // almost all tests would work with a possible exception of fault_injection + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, fileFlags, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "NewRandomAccessFile failed to Create/Open: " + fname, lastError); + } + + UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); + + // CAUTION! This will map the entire file into the process address space + if (options.use_mmap_reads && sizeof(void*) >= 8) { + // Use mmap when virtual address-space is plentiful. 
+ uint64_t fileSize; + + s = GetFileSize(fname, &fileSize); + + if (s.ok()) { + // Will not map empty files + if (fileSize == 0) { + return IOError( + "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); + } + + HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY, + 0, // At its present length + 0, + NULL); // Mapping name + + if (!hMap) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create file mapping for NewRandomAccessFile: " + fname, + lastError); + } + + UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); + + const void* mapped_region = + MapViewOfFileEx(hMap, FILE_MAP_READ, + 0, // High DWORD of access start + 0, // Low DWORD + static_cast(fileSize), + NULL); // Let the OS choose the mapping + + if (!mapped_region) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to MapViewOfFile for NewRandomAccessFile: " + fname, + lastError); + } + + result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region, + static_cast(fileSize))); + + mapGuard.release(); + fileGuard.release(); + } + } else { + result->reset(new WinRandomAccessFile(fname, hFile, + std::max(GetSectorSize(fname), + page_size_), + options)); + fileGuard.release(); + } + return s; +} + +Status WinEnvIO::OpenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options, + bool reopen) { + + const size_t c_BufferCapacity = 64 * 1024; + + EnvOptions local_options(options); + + result->reset(); + Status s; + + DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; + + if (local_options.use_direct_writes && !local_options.use_mmap_writes) { + fileFlags = FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; + } + + // Desired access. 
We are want to write only here but if we want to memory + // map + // the file then there is no write only mode so we have to create it + // Read/Write + // However, MapViewOfFile specifies only Write only + DWORD desired_access = GENERIC_WRITE; + DWORD shared_mode = FILE_SHARE_READ; + + if (local_options.use_mmap_writes) { + desired_access |= GENERIC_READ; + } else { + // Adding this solely for tests to pass (fault_injection_test, + // wal_manager_test). + shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE); + } + + // This will always truncate the file + DWORD creation_disposition = CREATE_ALWAYS; + if (reopen) { + creation_disposition = OPEN_ALWAYS; + } + + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), + desired_access, // Access desired + shared_mode, + NULL, // Security attributes + // Posix env says (reopen) ? (O_CREATE | O_APPEND) : O_CREAT | O_TRUNC + creation_disposition, + fileFlags, // Flags + NULL); // Template File + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create a NewWriteableFile: " + fname, lastError); + } + + // We will start writing at the end, appending + if (reopen) { + LARGE_INTEGER zero_move; + zero_move.QuadPart = 0; + BOOL ret = SetFilePointerEx(hFile, zero_move, NULL, FILE_END); + if (!ret) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create a ReopenWritableFile move to the end: " + fname, + lastError); + } + } + + if (options.use_mmap_writes) { + // We usually do not use mmmapping on SSD and thus we pass memory + // page_size + result->reset(new WinMmapFile(fname, hFile, page_size_, + allocation_granularity_, local_options)); + } else { + // Here we want the buffer allocation to be aligned by the SSD page size + // and to be a multiple of it + result->reset(new WinWritableFile(fname, hFile, + std::max(GetSectorSize(fname), + GetPageSize()), + 
c_BufferCapacity, local_options)); + } + return s; +} + +Status WinEnvIO::NewRandomRWFile(const std::string & fname, + std::unique_ptr* result, + const EnvOptions & options) { + + Status s; + + // Open the file for read-only random access + // Random access is to disable read-ahead as the system reads too much data + DWORD desired_access = GENERIC_READ | GENERIC_WRITE; + DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; + DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist + DWORD file_flags = FILE_FLAG_RANDOM_ACCESS; + + if (options.use_direct_reads && options.use_direct_writes) { + file_flags |= FILE_FLAG_NO_BUFFERING; + } + + /// Shared access is necessary for corruption test to pass + // almost all tests would work with a possible exception of fault_injection + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = + RX_CreateFile(RX_FN(fname).c_str(), + desired_access, + shared_mode, + NULL, // Security attributes + creation_disposition, + file_flags, + NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "NewRandomRWFile failed to Create/Open: " + fname, lastError); + } + + UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); + result->reset(new WinRandomRWFile(fname, hFile, + std::max(GetSectorSize(fname), + GetPageSize()), + options)); + fileGuard.release(); + + return s; +} + +Status WinEnvIO::NewMemoryMappedFileBuffer( + const std::string & fname, + std::unique_ptr* result) { + Status s; + result->reset(); + + DWORD fileFlags = FILE_ATTRIBUTE_READONLY; + + HANDLE hFile = INVALID_HANDLE_VALUE; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, + OPEN_EXISTING, // Open only if it exists + fileFlags, + NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + s = 
IOErrorFromWindowsError( + "Failed to open NewMemoryMappedFileBuffer: " + fname, lastError); + return s; + } + UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); + + uint64_t fileSize = 0; + s = GetFileSize(fname, &fileSize); + if (!s.ok()) { + return s; + } + // Will not map empty files + if (fileSize == 0) { + return Status::NotSupported( + "NewMemoryMappedFileBuffer can not map zero length files: " + fname); + } + + // size_t is 32-bit with 32-bit builds + if (fileSize > std::numeric_limits::max()) { + return Status::NotSupported( + "The specified file size does not fit into 32-bit memory addressing: " + + fname); + } + + HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE, + 0, // Whole file at its present length + 0, + NULL); // Mapping name + + if (!hMap) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create file mapping for: " + fname, lastError); + } + UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); + + void* base = MapViewOfFileEx(hMap, FILE_MAP_WRITE, + 0, // High DWORD of access start + 0, // Low DWORD + static_cast(fileSize), + NULL); // Let the OS choose the mapping + + if (!base) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to MapViewOfFile for NewMemoryMappedFileBuffer: " + fname, + lastError); + } + + result->reset(new WinMemoryMappedBuffer(hFile, hMap, base, + static_cast(fileSize))); + + mapGuard.release(); + fileGuard.release(); + + return s; +} + +Status WinEnvIO::NewDirectory(const std::string& name, + std::unique_ptr* result) { + Status s; + // Must be nullptr on failure + result->reset(); + + if (!DirExists(name)) { + s = IOErrorFromWindowsError( + "open folder: " + name, ERROR_DIRECTORY); + return s; + } + + HANDLE handle = INVALID_HANDLE_VALUE; + // 0 - for access means read metadata + { + IOSTATS_TIMER_GUARD(open_nanos); + handle = RX_CreateFile( + RX_FN(name).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, + 
OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + } + + if (INVALID_HANDLE_VALUE == handle) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("open folder: " + name, lastError); + return s; + } + + result->reset(new WinDirectory(handle)); + + return s; +} + +Status WinEnvIO::FileExists(const std::string& fname) { + Status s; + // TODO: This does not follow symbolic links at this point + // which is consistent with _access() impl on windows + // but can be added + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (FALSE == RX_GetFileAttributesEx(RX_FN(fname).c_str(), + GetFileExInfoStandard, &attrs)) { + auto lastError = GetLastError(); + switch (lastError) { + case ERROR_ACCESS_DENIED: + case ERROR_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + s = Status::NotFound(); + break; + default: + s = IOErrorFromWindowsError("Unexpected error for: " + fname, + lastError); + break; + } + } + return s; +} + +Status WinEnvIO::GetChildren(const std::string& dir, + std::vector* result) { + + Status status; + result->clear(); + std::vector output; + + RX_WIN32_FIND_DATA data; + memset(&data, 0, sizeof(data)); + std::string pattern(dir); + pattern.append("\\").append("*"); + + HANDLE handle = RX_FindFirstFileEx(RX_FN(pattern).c_str(), + // Do not want alternative name + FindExInfoBasic, + &data, + FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); + + if (handle == INVALID_HANDLE_VALUE) { + auto lastError = GetLastError(); + switch (lastError) { + case ERROR_NOT_FOUND: + case ERROR_ACCESS_DENIED: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + status = Status::NotFound(); + break; + default: + status = IOErrorFromWindowsError( + "Failed to GetChhildren for: " + dir, lastError); + } + return status; + } + + UniqueFindClosePtr fc(handle, FindCloseFunc); + + if (result->capacity() > 0) { + output.reserve(result->capacity()); + } + + // For safety + data.cFileName[MAX_PATH - 1] = 0; + + while (true) { 
+ auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); + output.emplace_back(FN_TO_RX(x)); + BOOL ret =- RX_FindNextFile(handle, &data); + // If the function fails the return value is zero + // and non-zero otherwise. Not TRUE or FALSE. + if (ret == FALSE) { + // Posix does not care why we stopped + break; + } + data.cFileName[MAX_PATH - 1] = 0; + } + output.swap(*result); + return status; +} + +Status WinEnvIO::CreateDir(const std::string& name) { + Status result; + BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError( + "Failed to create a directory: " + name, lastError); + } + + return result; +} + +Status WinEnvIO::CreateDirIfMissing(const std::string& name) { + Status result; + + if (DirExists(name)) { + return result; + } + + BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); + if (!ret) { + auto lastError = GetLastError(); + if (lastError != ERROR_ALREADY_EXISTS) { + result = IOErrorFromWindowsError( + "Failed to create a directory: " + name, lastError); + } else { + result = + Status::IOError(name + ": exists but is not a directory"); + } + } + return result; +} + +Status WinEnvIO::DeleteDir(const std::string& name) { + Status result; + BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str()); + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to remove dir: " + name, + lastError); + } + return result; +} + +Status WinEnvIO::GetFileSize(const std::string& fname, + uint64_t* size) { + Status s; + + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, + &attrs)) { + ULARGE_INTEGER file_size; + file_size.HighPart = attrs.nFileSizeHigh; + file_size.LowPart = attrs.nFileSizeLow; + *size = file_size.QuadPart; + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Can not get size for: " + fname, lastError); + } + return s; +} + +uint64_t 
WinEnvIO::FileTimeToUnixTime(const FILETIME& ftTime) { + const uint64_t c_FileTimePerSecond = 10000000U; + // UNIX epoch starts on 1970-01-01T00:00:00Z + // Windows FILETIME starts on 1601-01-01T00:00:00Z + // Therefore, we need to subtract the below number of seconds from + // the seconds that we obtain from FILETIME with an obvious loss of + // precision + const uint64_t c_SecondBeforeUnixEpoch = 11644473600U; + + ULARGE_INTEGER li; + li.HighPart = ftTime.dwHighDateTime; + li.LowPart = ftTime.dwLowDateTime; + + uint64_t result = + (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; + return result; +} + +Status WinEnvIO::GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) { + Status s; + + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, + &attrs)) { + *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Can not get file modification time for: " + fname, lastError); + *file_mtime = 0; + } + + return s; +} + +Status WinEnvIO::RenameFile(const std::string& src, + const std::string& target) { + Status result; + + // rename() is not capable of replacing the existing file as on Linux + // so use OS API directly + if (!RX_MoveFileEx(RX_FN(src).c_str(), RX_FN(target).c_str(), + MOVEFILE_REPLACE_EXISTING)) { + DWORD lastError = GetLastError(); + + std::string text("Failed to rename: "); + text.append(src).append(" to: ").append(target); + + result = IOErrorFromWindowsError(text, lastError); + } + + return result; +} + +Status WinEnvIO::LinkFile(const std::string& src, + const std::string& target) { + Status result; + + if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { + DWORD lastError = GetLastError(); + if (lastError == ERROR_NOT_SAME_DEVICE) { + return Status::NotSupported("No cross FS links allowed"); + } + + std::string text("Failed to link: "); + 
text.append(src).append(" to: ").append(target); + + result = IOErrorFromWindowsError(text, lastError); + } + + return result; +} + +Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { + Status s; + HANDLE handle = RX_CreateFile( + RX_FN(fname).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); + + if (INVALID_HANDLE_VALUE == handle) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("NumFileLinks: " + fname, lastError); + return s; + } + UniqueCloseHandlePtr handle_guard(handle, CloseHandleFunc); + FILE_STANDARD_INFO standard_info; + if (0 != GetFileInformationByHandleEx(handle, FileStandardInfo, + &standard_info, + sizeof(standard_info))) { + *count = standard_info.NumberOfLinks; + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("GetFileInformationByHandleEx: " + fname, + lastError); + } + return s; +} + +Status WinEnvIO::AreFilesSame(const std::string& first, + const std::string& second, bool* res) { +// For MinGW builds +#if (_WIN32_WINNT == _WIN32_WINNT_VISTA) + Status s = Status::NotSupported(); +#else + assert(res != nullptr); + Status s; + if (res == nullptr) { + s = Status::InvalidArgument("res"); + return s; + } + + // 0 - for access means read metadata + HANDLE file_1 = RX_CreateFile( + RX_FN(first).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + + if (INVALID_HANDLE_VALUE == file_1) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("open file: " + first, lastError); + return s; + } + UniqueCloseHandlePtr g_1(file_1, CloseHandleFunc); + + HANDLE file_2 = RX_CreateFile( + RX_FN(second).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + + if (INVALID_HANDLE_VALUE == 
file_2) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("open file: " + second, lastError); + return s; + } + UniqueCloseHandlePtr g_2(file_2, CloseHandleFunc); + + FILE_ID_INFO FileInfo_1; + BOOL result = GetFileInformationByHandleEx(file_1, FileIdInfo, &FileInfo_1, + sizeof(FileInfo_1)); + + if (!result) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("stat file: " + first, lastError); + return s; + } + + FILE_ID_INFO FileInfo_2; + result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, + sizeof(FileInfo_2)); + + if (!result) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("stat file: " + second, lastError); + return s; + } + + if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) { + *res = (0 == memcmp(FileInfo_1.FileId.Identifier, + FileInfo_2.FileId.Identifier, + sizeof(FileInfo_1.FileId.Identifier))); + } else { + *res = false; + } +#endif + return s; +} + +Status WinEnvIO::LockFile(const std::string& lockFname, + FileLock** lock) { + assert(lock != nullptr); + + *lock = NULL; + Status result; + + // No-sharing, this is a LOCK file + const DWORD ExclusiveAccessON = 0; + + // Obtain exclusive access to the LOCK file + // Previously, instead of NORMAL attr we set DELETE on close and that worked + // well except with fault_injection test that insists on deleting it. 
+ HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile(RX_FN(lockFname).c_str(), + (GENERIC_READ | GENERIC_WRITE), + ExclusiveAccessON, NULL, CREATE_ALWAYS, + FILE_ATTRIBUTE_NORMAL, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError( + "Failed to create lock file: " + lockFname, lastError); + } else { + *lock = new WinFileLock(hFile); + } + + return result; +} + +Status WinEnvIO::UnlockFile(FileLock* lock) { + Status result; + + assert(lock != nullptr); + + delete lock; + + return result; +} + +Status WinEnvIO::GetTestDirectory(std::string* result) { + + std::string output; + + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + output = env; + } else { + env = getenv("TMP"); + + if (env && env[0] != '\0') { + output = env; + } else { + output = "c:\\tmp"; + } + } + CreateDir(output); + + output.append("\\testrocksdb-"); + output.append(std::to_string(_getpid())); + + CreateDir(output); + + output.swap(*result); + + return Status::OK(); +} + +Status WinEnvIO::NewLogger(const std::string& fname, + std::shared_ptr* result) { + Status s; + + result->reset(); + + HANDLE hFile = 0; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_DELETE, // In RocksDb log files are + // renamed and deleted before + // they are closed. This enables + // doing so. 
+ NULL, + CREATE_ALWAYS, // Original fopen mode is "w" + FILE_ATTRIBUTE_NORMAL, NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to open LogFile" + fname, lastError); + } else { + { + // With log files we want to set the true creation time as of now + // because the system + // for some reason caches the attributes of the previous file that just + // been renamed from + // this name so auto_roll_logger_test fails + FILETIME ft; + GetSystemTimeAsFileTime(&ft); + // Set creation, last access and last write time to the same value + SetFileTime(hFile, &ft, &ft, &ft); + } + result->reset(new WinLogger(&WinEnvThreads::gettid, hosted_env_, hFile)); + } + return s; +} + +uint64_t WinEnvIO::NowMicros() { + + if (GetSystemTimePreciseAsFileTime_ != NULL) { + // all std::chrono clocks on windows proved to return + // values that may repeat that is not good enough for some uses. + const int64_t c_UnixEpochStartTicks = 116444736000000000LL; + const int64_t c_FtToMicroSec = 10; + + // This interface needs to return system time and not + // just any microseconds because it is often used as an argument + // to TimedWait() on condition variable + FILETIME ftSystemTime; + GetSystemTimePreciseAsFileTime_(&ftSystemTime); + + LARGE_INTEGER li; + li.LowPart = ftSystemTime.dwLowDateTime; + li.HighPart = ftSystemTime.dwHighDateTime; + // Subtract unix epoch start + li.QuadPart -= c_UnixEpochStartTicks; + // Convert to microsecs + li.QuadPart /= c_FtToMicroSec; + return li.QuadPart; + } + using namespace std::chrono; + return duration_cast(system_clock::now().time_since_epoch()) + .count(); +} + +uint64_t WinEnvIO::NowNanos() { + if (nano_seconds_per_period_ != 0) { + // all std::chrono clocks on windows have the same resolution that is only + // good enough for microseconds but not nanoseconds + // On Windows 8 and Windows 2012 Server + // GetSystemTimePreciseAsFileTime(¤t_time) can be used + LARGE_INTEGER li; + 
QueryPerformanceCounter(&li); + // Convert performance counter to nanoseconds by precomputed ratio. + // Directly multiply nano::den with li.QuadPart causes overflow. + // Only do this when nano::den is divisible by perf_counter_frequency_, + // which most likely is the case in reality. If it's not, fall back to + // high_resolution_clock, which may be less precise under old compilers. + li.QuadPart *= nano_seconds_per_period_; + return li.QuadPart; + } + using namespace std::chrono; + return duration_cast( + high_resolution_clock::now().time_since_epoch()).count(); +} + +Status WinEnvIO::GetHostName(char* name, uint64_t len) { + Status s; + DWORD nSize = static_cast( + std::min(len, std::numeric_limits::max())); + + if (!::GetComputerNameA(name, &nSize)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("GetHostName", lastError); + } else { + name[nSize] = 0; + } + + return s; +} + +Status WinEnvIO::GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + // Check if we already have an absolute path + // For test compatibility we will consider starting slash as an + // absolute path + if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) || + !RX_PathIsRelative(RX_FN(db_path).c_str())) { + *output_path = db_path; + return Status::OK(); + } + + RX_FILESTRING result; + result.resize(MAX_PATH); + + // Hopefully no changes the current directory while we do this + // however _getcwd also suffers from the same limitation + DWORD len = RX_GetCurrentDirectory(MAX_PATH, &result[0]); + if (len == 0) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError("Failed to get current working directory", + lastError); + } + + result.resize(len); + std::string res = FN_TO_RX(result); + + res.swap(*output_path); + return Status::OK(); +} + +std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) { + std::string result; + + const time_t seconds = secondsSince1970; + const int maxsize = 64; + + struct tm t; + errno_t 
ret = localtime_s(&t, &seconds); + + if (ret) { + result = std::to_string(seconds); + } else { + result.resize(maxsize); + char* p = &result[0]; + + int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec); + assert(len > 0); + + result.resize(len); + } + + return result; +} + +Status WinEnvIO::GetFreeSpace(const std::string& path, uint64_t* diskfree) { + assert(diskfree != nullptr); + ULARGE_INTEGER freeBytes; + BOOL f = RX_GetDiskFreeSpaceEx(RX_FN(path).c_str(), &freeBytes, NULL, NULL); + if (f) { + *diskfree = freeBytes.QuadPart; + return Status::OK(); + } else { + DWORD lastError = GetLastError(); + return IOErrorFromWindowsError("Failed to get free space: " + path, + lastError); + } +} + +EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const { + EnvOptions optimized(env_options); + // These two the same as default optimizations + optimized.bytes_per_sync = db_options.wal_bytes_per_sync; + optimized.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; + + // This adversely affects %999 on windows + optimized.use_mmap_writes = false; + // Direct writes will produce a huge perf impact on + // Windows. Pre-allocate space for WAL. + optimized.use_direct_writes = false; + return optimized; +} + +EnvOptions WinEnvIO::OptimizeForManifestWrite( + const EnvOptions& env_options) const { + EnvOptions optimized(env_options); + optimized.use_mmap_writes = false; + optimized.use_direct_reads = false; + return optimized; +} + +EnvOptions WinEnvIO::OptimizeForManifestRead( + const EnvOptions& env_options) const { + EnvOptions optimized(env_options); + optimized.use_mmap_writes = false; + optimized.use_direct_reads = false; + return optimized; +} + +// Returns true iff the named directory exists and is a directory. 
+bool WinEnvIO::DirExists(const std::string& dname) { + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), + GetFileExInfoStandard, &attrs)) { + return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); + } + return false; +} + +size_t WinEnvIO::GetSectorSize(const std::string& fname) { + size_t sector_size = kSectorSize; + + if (RX_PathIsRelative(RX_FN(fname).c_str())) { + return sector_size; + } + + // obtain device handle + char devicename[7] = "\\\\.\\"; + int erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2); + + if (erresult) { + assert(false); + return sector_size; + } + + HANDLE hDevice = CreateFile(devicename, 0, 0, nullptr, OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, nullptr); + + if (hDevice == INVALID_HANDLE_VALUE) { + return sector_size; + } + + STORAGE_PROPERTY_QUERY spropertyquery; + spropertyquery.PropertyId = StorageAccessAlignmentProperty; + spropertyquery.QueryType = PropertyStandardQuery; + + BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)]; + DWORD output_bytes = 0; + + BOOL ret = DeviceIoControl(hDevice, IOCTL_STORAGE_QUERY_PROPERTY, + &spropertyquery, sizeof(spropertyquery), + output_buffer, + sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), + &output_bytes, nullptr); + + if (ret) { + sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR *)output_buffer)->BytesPerLogicalSector; + } else { + // many devices do not support StorageProcessAlignmentProperty. 
Any failure here and we + // fall back to logical alignment + + DISK_GEOMETRY_EX geometry = { 0 }; + ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, + nullptr, 0, &geometry, sizeof(geometry), &output_bytes, nullptr); + if (ret) { + sector_size = geometry.Geometry.BytesPerSector; + } + } + + if (hDevice != INVALID_HANDLE_VALUE) { + CloseHandle(hDevice); + } + + return sector_size; +} + +//////////////////////////////////////////////////////////////////////// +// WinEnvThreads + +WinEnvThreads::WinEnvThreads(Env* hosted_env) + : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) { + + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].SetThreadPriority( + static_cast(pool_id)); + // This allows later initializing the thread-local-env of each thread. + thread_pools_[pool_id].SetHostEnv(hosted_env); + } +} + +WinEnvThreads::~WinEnvThreads() { + + WaitForJoin(); + + for (auto& thpool : thread_pools_) { + thpool.JoinAllThreads(); + } +} + +void WinEnvThreads::Schedule(void(*function)(void*), void* arg, + Env::Priority pri, void* tag, + void(*unschedFunction)(void* arg)) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); +} + +int WinEnvThreads::UnSchedule(void* arg, Env::Priority pri) { + return thread_pools_[pri].UnSchedule(arg); +} + +namespace { + + struct StartThreadState { + void(*user_function)(void*); + void* arg; + }; + + void* StartThreadWrapper(void* arg) { + std::unique_ptr state( + reinterpret_cast(arg)); + state->user_function(state->arg); + return nullptr; + } + +} + +void WinEnvThreads::StartThread(void(*function)(void* arg), void* arg) { + std::unique_ptr state(new StartThreadState); + state->user_function = function; + state->arg = arg; + try { + ROCKSDB_NAMESPACE::port::WindowsThread th(&StartThreadWrapper, state.get()); + state.release(); + + std::lock_guard lg(mu_); + 
threads_to_join_.push_back(std::move(th)); + + } catch (const std::system_error& ex) { + WinthreadCall("start thread", ex.code()); + } +} + +void WinEnvThreads::WaitForJoin() { + for (auto& th : threads_to_join_) { + th.join(); + } + threads_to_join_.clear(); +} + +unsigned int WinEnvThreads::GetThreadPoolQueueLen(Env::Priority pri) const { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + return thread_pools_[pri].GetQueueLen(); +} + +uint64_t WinEnvThreads::gettid() { + uint64_t thread_id = GetCurrentThreadId(); + return thread_id; +} + +uint64_t WinEnvThreads::GetThreadID() const { return gettid(); } + +void WinEnvThreads::SleepForMicroseconds(int micros) { + std::this_thread::sleep_for(std::chrono::microseconds(micros)); +} + +void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + thread_pools_[pri].SetBackgroundThreads(num); +} + +int WinEnvThreads::GetBackgroundThreads(Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + return thread_pools_[pri].GetBackgroundThreads(); +} + +void WinEnvThreads::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); +} + +///////////////////////////////////////////////////////////////////////// +// WinEnv + +WinEnv::WinEnv() : winenv_io_(this), winenv_threads_(this) { + // Protected member of the base class + thread_status_updater_ = CreateThreadStatusUpdater(); +} + + +WinEnv::~WinEnv() { + // All threads must be joined before the deletion of + // thread_status_updater_. 
+ delete thread_status_updater_; +} + +Status WinEnv::GetThreadList(std::vector* thread_list) { + assert(thread_status_updater_); + return thread_status_updater_->GetThreadList(thread_list); +} + +Status WinEnv::DeleteFile(const std::string& fname) { + return winenv_io_.DeleteFile(fname); +} + +Status WinEnv::Truncate(const std::string& fname, size_t size) { + return winenv_io_.Truncate(fname, size); +} + +Status WinEnv::GetCurrentTime(int64_t* unix_time) { + return winenv_io_.GetCurrentTime(unix_time); +} + +Status WinEnv::NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + return winenv_io_.NewSequentialFile(fname, result, options); +} + +Status WinEnv::NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + return winenv_io_.NewRandomAccessFile(fname, result, options); +} + +Status WinEnv::NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + return winenv_io_.OpenWritableFile(fname, result, options, false); +} + +Status WinEnv::ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + return winenv_io_.OpenWritableFile(fname, result, options, true); +} + +Status WinEnv::NewRandomRWFile(const std::string & fname, + std::unique_ptr* result, + const EnvOptions & options) { + return winenv_io_.NewRandomRWFile(fname, result, options); +} + +Status WinEnv::NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) { + return winenv_io_.NewMemoryMappedFileBuffer(fname, result); +} + +Status WinEnv::NewDirectory(const std::string& name, + std::unique_ptr* result) { + return winenv_io_.NewDirectory(name, result); +} + +Status WinEnv::FileExists(const std::string& fname) { + return winenv_io_.FileExists(fname); +} + +Status WinEnv::GetChildren(const std::string& dir, + std::vector* result) { + return winenv_io_.GetChildren(dir, result); +} + +Status 
WinEnv::CreateDir(const std::string& name) { + return winenv_io_.CreateDir(name); +} + +Status WinEnv::CreateDirIfMissing(const std::string& name) { + return winenv_io_.CreateDirIfMissing(name); +} + +Status WinEnv::DeleteDir(const std::string& name) { + return winenv_io_.DeleteDir(name); +} + +Status WinEnv::GetFileSize(const std::string& fname, + uint64_t* size) { + return winenv_io_.GetFileSize(fname, size); +} + +Status WinEnv::GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) { + return winenv_io_.GetFileModificationTime(fname, file_mtime); +} + +Status WinEnv::RenameFile(const std::string& src, + const std::string& target) { + return winenv_io_.RenameFile(src, target); +} + +Status WinEnv::LinkFile(const std::string& src, + const std::string& target) { + return winenv_io_.LinkFile(src, target); +} + +Status WinEnv::NumFileLinks(const std::string& fname, uint64_t* count) { + return winenv_io_.NumFileLinks(fname, count); +} + +Status WinEnv::AreFilesSame(const std::string& first, + const std::string& second, bool* res) { + return winenv_io_.AreFilesSame(first, second, res); +} + +Status WinEnv::LockFile(const std::string& lockFname, + FileLock** lock) { + return winenv_io_.LockFile(lockFname, lock); +} + +Status WinEnv::UnlockFile(FileLock* lock) { + return winenv_io_.UnlockFile(lock); +} + +Status WinEnv::GetTestDirectory(std::string* result) { + return winenv_io_.GetTestDirectory(result); +} + +Status WinEnv::NewLogger(const std::string& fname, + std::shared_ptr* result) { + return winenv_io_.NewLogger(fname, result); +} + +uint64_t WinEnv::NowMicros() { + return winenv_io_.NowMicros(); +} + +uint64_t WinEnv::NowNanos() { + return winenv_io_.NowNanos(); +} + +Status WinEnv::GetHostName(char* name, uint64_t len) { + return winenv_io_.GetHostName(name, len); +} + +Status WinEnv::GetAbsolutePath(const std::string& db_path, + std::string* output_path) { + return winenv_io_.GetAbsolutePath(db_path, output_path); +} + +std::string 
WinEnv::TimeToString(uint64_t secondsSince1970) { + return winenv_io_.TimeToString(secondsSince1970); +} + +void WinEnv::Schedule(void(*function)(void*), void* arg, Env::Priority pri, + void* tag, + void(*unschedFunction)(void* arg)) { + return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction); +} + +int WinEnv::UnSchedule(void* arg, Env::Priority pri) { + return winenv_threads_.UnSchedule(arg, pri); +} + +void WinEnv::StartThread(void(*function)(void* arg), void* arg) { + return winenv_threads_.StartThread(function, arg); +} + +void WinEnv::WaitForJoin() { + return winenv_threads_.WaitForJoin(); +} + +unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { + return winenv_threads_.GetThreadPoolQueueLen(pri); +} + +uint64_t WinEnv::GetThreadID() const { + return winenv_threads_.GetThreadID(); +} + +Status WinEnv::GetFreeSpace(const std::string& path, uint64_t* diskfree) { + return winenv_io_.GetFreeSpace(path, diskfree); +} + +void WinEnv::SleepForMicroseconds(int micros) { + return winenv_threads_.SleepForMicroseconds(micros); +} + +// Allow increasing the number of worker threads. 
+void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { + return winenv_threads_.SetBackgroundThreads(num, pri); +} + +int WinEnv::GetBackgroundThreads(Env::Priority pri) { + return winenv_threads_.GetBackgroundThreads(pri); +} + +void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { + return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri); +} + +EnvOptions WinEnv::OptimizeForManifestRead( + const EnvOptions& env_options) const { + return winenv_io_.OptimizeForManifestRead(env_options); +} + +EnvOptions WinEnv::OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const { + return winenv_io_.OptimizeForLogWrite(env_options, db_options); +} + +EnvOptions WinEnv::OptimizeForManifestWrite( + const EnvOptions& env_options) const { + return winenv_io_.OptimizeForManifestWrite(env_options); +} + +} // namespace port + +std::string Env::GenerateUniqueId() { + std::string result; + + UUID uuid; + UuidCreateSequential(&uuid); + + RPC_CSTR rpc_str; + auto status = UuidToStringA(&uuid, &rpc_str); + (void)status; + assert(status == RPC_S_OK); + + result = reinterpret_cast(rpc_str); + + status = RpcStringFreeA(&rpc_str); + assert(status == RPC_S_OK); + + return result; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/win/env_win.h b/src/rocksdb/port/win/env_win.h new file mode 100644 index 000000000..5dfca58bd --- /dev/null +++ b/src/rocksdb/port/win/env_win.h @@ -0,0 +1,347 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// An Env is an interface used by the rocksdb implementation to access +// operating system functionality like the filesystem etc. Callers +// may wish to provide a custom Env object when opening a database to +// get fine gain control; e.g., to rate limit file system operations. +// +// All Env implementations are safe for concurrent access from +// multiple threads without any external synchronization. + +#pragma once + +#include "port/win/win_thread.h" +#include +#include "util/threadpool_imp.h" + +#include +#include + +#include +#include +#include + + +#undef GetCurrentTime +#undef DeleteFile +#undef GetTickCount + +namespace ROCKSDB_NAMESPACE { +namespace port { + +// Currently not designed for inheritance but rather a replacement +class WinEnvThreads { +public: + + explicit WinEnvThreads(Env* hosted_env); + + ~WinEnvThreads(); + + WinEnvThreads(const WinEnvThreads&) = delete; + WinEnvThreads& operator=(const WinEnvThreads&) = delete; + + void Schedule(void(*function)(void*), void* arg, Env::Priority pri, + void* tag, void(*unschedFunction)(void* arg)); + + int UnSchedule(void* arg, Env::Priority pri); + + void StartThread(void(*function)(void* arg), void* arg); + + void WaitForJoin(); + + unsigned int GetThreadPoolQueueLen(Env::Priority pri) const; + + static uint64_t gettid(); + + uint64_t GetThreadID() const; + + void SleepForMicroseconds(int micros); + + // Allow increasing the number of worker threads. 
+ void SetBackgroundThreads(int num, Env::Priority pri); + int GetBackgroundThreads(Env::Priority pri); + + void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri); + +private: + + Env* hosted_env_; + mutable std::mutex mu_; + std::vector thread_pools_; + std::vector threads_to_join_; + +}; + +// Designed for inheritance so can be re-used +// but certain parts replaced +class WinEnvIO { +public: + explicit WinEnvIO(Env* hosted_env); + + virtual ~WinEnvIO(); + + virtual Status DeleteFile(const std::string& fname); + + Status Truncate(const std::string& fname, size_t size); + + virtual Status GetCurrentTime(int64_t* unix_time); + + virtual Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + + // Helper for NewWritable and ReopenWritableFile + virtual Status OpenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options, + bool reopen); + + virtual Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + + // The returned file will only be accessed by one thread at a time. 
+ virtual Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + + virtual Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result); + + virtual Status NewDirectory(const std::string& name, + std::unique_ptr* result); + + virtual Status FileExists(const std::string& fname); + + virtual Status GetChildren(const std::string& dir, + std::vector* result); + + virtual Status CreateDir(const std::string& name); + + virtual Status CreateDirIfMissing(const std::string& name); + + virtual Status DeleteDir(const std::string& name); + + virtual Status GetFileSize(const std::string& fname, uint64_t* size); + + static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime); + + virtual Status RenameFile(const std::string& src, const std::string& target); + + virtual Status LinkFile(const std::string& src, const std::string& target); + + virtual Status NumFileLinks(const std::string& /*fname*/, + uint64_t* /*count*/); + + virtual Status AreFilesSame(const std::string& first, + const std::string& second, bool* res); + + virtual Status LockFile(const std::string& lockFname, FileLock** lock); + + virtual Status UnlockFile(FileLock* lock); + + virtual Status GetTestDirectory(std::string* result); + + virtual Status NewLogger(const std::string& fname, + std::shared_ptr* result); + + virtual uint64_t NowMicros(); + + virtual uint64_t NowNanos(); + + virtual Status GetHostName(char* name, uint64_t len); + + virtual Status GetAbsolutePath(const std::string& db_path, + std::string* output_path); + + // This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + + // Get the amount of free disk space + virtual Status GetFreeSpace(const std::string& path, uint64_t* diskfree); + + virtual std::string TimeToString(uint64_t secondsSince1970); + + virtual EnvOptions OptimizeForLogWrite(const 
EnvOptions& env_options, + const DBOptions& db_options) const; + + virtual EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const; + + virtual EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const; + + size_t GetPageSize() const { return page_size_; } + + size_t GetAllocationGranularity() const { return allocation_granularity_; } + + uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } + + static size_t GetSectorSize(const std::string& fname); + +private: + // Returns true iff the named directory exists and is a directory. + virtual bool DirExists(const std::string& dname); + + typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); + + Env* hosted_env_; + size_t page_size_; + size_t allocation_granularity_; + uint64_t perf_counter_frequency_; + uint64_t nano_seconds_per_period_; + FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; +}; + +class WinEnv : public Env { +public: + WinEnv(); + + ~WinEnv(); + + Status DeleteFile(const std::string& fname) override; + + Status Truncate(const std::string& fname, size_t size) override; + + Status GetCurrentTime(int64_t* unix_time) override; + + Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. 
+ Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + // The returned file will only be accessed by one thread at a time. + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override; + + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override; + + Status FileExists(const std::string& fname) override; + + Status GetChildren(const std::string& dir, + std::vector* result) override; + + Status CreateDir(const std::string& name) override; + + Status CreateDirIfMissing(const std::string& name) override; + + Status DeleteDir(const std::string& name) override; + + Status GetFileSize(const std::string& fname, + uint64_t* size) override; + + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override; + + Status RenameFile(const std::string& src, + const std::string& target) override; + + Status LinkFile(const std::string& src, + const std::string& target) override; + + Status NumFileLinks(const std::string& fname, uint64_t* count) override; + + Status AreFilesSame(const std::string& first, + const std::string& second, bool* res) override; + + Status LockFile(const std::string& lockFname, FileLock** lock) override; + + Status UnlockFile(FileLock* lock) override; + + Status GetTestDirectory(std::string* result) override; + + Status NewLogger(const std::string& fname, + std::shared_ptr* result) override; + + uint64_t NowMicros() override; + + uint64_t NowNanos() override; + + Status GetHostName(char* name, uint64_t len) override; + + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override; + + std::string TimeToString(uint64_t secondsSince1970) override; + + Status GetThreadList(std::vector* thread_list) override; + + void Schedule(void(*function)(void*), void* arg, 
Env::Priority pri, + void* tag, void(*unschedFunction)(void* arg)) override; + + int UnSchedule(void* arg, Env::Priority pri) override; + + void StartThread(void(*function)(void* arg), void* arg) override; + + void WaitForJoin(); + + unsigned int GetThreadPoolQueueLen(Env::Priority pri) const override; + + uint64_t GetThreadID() const override; + + // This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + + // Get the amount of free disk space + Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override; + + void SleepForMicroseconds(int micros) override; + + // Allow increasing the number of worker threads. + void SetBackgroundThreads(int num, Env::Priority pri) override; + int GetBackgroundThreads(Env::Priority pri) override; + + void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override; + + EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const override; + + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const override; + + EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const override; + + +private: + + WinEnvIO winenv_io_; + WinEnvThreads winenv_threads_; +}; + +} // namespace port +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/win/io_win.cc b/src/rocksdb/port/win/io_win.cc new file mode 100644 index 000000000..e6c6890ef --- /dev/null +++ b/src/rocksdb/port/win/io_win.cc @@ -0,0 +1,1069 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "port/win/io_win.h" + +#include "monitoring/iostats_context_imp.h" +#include "test_util/sync_point.h" +#include "util/aligned_buffer.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { + +/* +* DirectIOHelper +*/ +namespace { + +const size_t kSectorSize = 512; + +inline +bool IsPowerOfTwo(const size_t alignment) { + return ((alignment) & (alignment - 1)) == 0; +} + +inline +bool IsSectorAligned(const size_t off) { + return (off & (kSectorSize - 1)) == 0; +} + +inline +bool IsAligned(size_t alignment, const void* ptr) { + return ((uintptr_t(ptr)) & (alignment - 1)) == 0; +} +} + + +std::string GetWindowsErrSz(DWORD err) { + LPSTR lpMsgBuf; + FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, + 0, // Default language + reinterpret_cast(&lpMsgBuf), 0, NULL); + + std::string Err = lpMsgBuf; + LocalFree(lpMsgBuf); + return Err; +} + +// We preserve the original name of this interface to denote the original idea +// behind it. +// All reads happen by a specified offset and pwrite interface does not change +// the position of the file pointer. Judging from the man page and errno it does +// execute +// lseek atomically to return the position of the file back where it was. +// WriteFile() does not +// have this capability. Therefore, for both pread and pwrite the pointer is +// advanced to the next position +// which is fine for writes because they are (should be) sequential. +// Because all the reads/writes happen by the specified offset, the caller in +// theory should not +// rely on the current file offset. 
+Status pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written) { + + Status s; + bytes_written = 0; + + size_t num_bytes = data.size(); + if (num_bytes > std::numeric_limits::max()) { + // May happen in 64-bit builds where size_t is 64-bits but + // long is still 32-bit, but that's the API here at the moment + return Status::InvalidArgument("num_bytes is too large for a single write: " + + file_data->GetName()); + } + + OVERLAPPED overlapped = { 0 }; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + DWORD bytesWritten = 0; + + if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast(num_bytes), + &bytesWritten, &overlapped)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(), + lastError); + } else { + bytes_written = bytesWritten; + } + + return s; +} + +// See comments for pwrite above +Status pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read) { + + Status s; + bytes_read = 0; + + if (num_bytes > std::numeric_limits::max()) { + return Status::InvalidArgument("num_bytes is too large for a single read: " + + file_data->GetName()); + } + + OVERLAPPED overlapped = { 0 }; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + DWORD bytesRead = 0; + + if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast(num_bytes), + &bytesRead, &overlapped)) { + auto lastError = GetLastError(); + // EOF is OK with zero bytes read + if (lastError != ERROR_HANDLE_EOF) { + s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(), + lastError); + } + } else { + bytes_read = bytesRead; + } + + return s; +} + +// SetFileInformationByHandle() is capable of fast pre-allocates. 
+// However, this does not change the file end position unless the file is +// truncated and the pre-allocated space is not considered filled with zeros. +Status fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size) { + Status status; + + FILE_ALLOCATION_INFO alloc_info; + alloc_info.AllocationSize.QuadPart = to_size; + + if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, + sizeof(FILE_ALLOCATION_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError( + "Failed to pre-allocate space: " + filename, lastError); + } + + return status; +} + +Status ftruncate(const std::string& filename, HANDLE hFile, + uint64_t toSize) { + Status status; + + FILE_END_OF_FILE_INFO end_of_file; + end_of_file.EndOfFile.QuadPart = toSize; + + if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, + sizeof(FILE_END_OF_FILE_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, + lastError); + } + + return status; +} + +size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/, + size_t /*max_size*/) { + // Returning 0 is safe as it causes the table reader to generate a unique ID. + // This is suboptimal for performance as it prevents multiple table readers + // for the same file from sharing cached blocks. For example, if users have + // a low value for `max_open_files`, there can be many table readers opened + // for the same file. + // + // TODO: this is a temporarily solution as it is safe but not optimal for + // performance. For more details see discussion in + // https://github.com/facebook/rocksdb/pull/5844. 
+ return 0; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// WinMmapReadableFile + +WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName, + HANDLE hFile, HANDLE hMap, + const void* mapped_region, + size_t length) + : WinFileData(fileName, hFile, false /* use_direct_io */), + hMap_(hMap), + mapped_region_(mapped_region), + length_(length) {} + +WinMmapReadableFile::~WinMmapReadableFile() { + BOOL ret __attribute__((__unused__)); + ret = ::UnmapViewOfFile(mapped_region_); + assert(ret); + + ret = ::CloseHandle(hMap_); + assert(ret); +} + +Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + + if (offset > length_) { + *result = Slice(); + return IOError(filename_, EINVAL); + } else if (offset + n > length_) { + n = length_ - static_cast(offset); + } + *result = + Slice(reinterpret_cast(mapped_region_)+offset, n); + return s; +} + +Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +/////////////////////////////////////////////////////////////////////////////// +/// WinMmapFile + + +// Can only truncate or reserve to a sector size aligned if +// used on files that are opened with Unbuffered I/O +Status WinMmapFile::TruncateFile(uint64_t toSize) { + return ftruncate(filename_, hFile_, toSize); +} + +Status WinMmapFile::UnmapCurrentRegion() { + Status status; + + if (mapped_begin_ != nullptr) { + if (!::UnmapViewOfFile(mapped_begin_)) { + status = IOErrorFromWindowsError( + "Failed to unmap file view: " + filename_, GetLastError()); + } + + // Move on to the next portion of the file + file_offset_ += view_size_; + + // UnmapView automatically sends data to disk but not the metadata + // which is good and provides some equivalent of fdatasync() on 
Linux + // therefore, we donot need separate flag for metadata + mapped_begin_ = nullptr; + mapped_end_ = nullptr; + dst_ = nullptr; + + last_sync_ = nullptr; + pending_sync_ = false; + } + + return status; +} + +Status WinMmapFile::MapNewRegion() { + + Status status; + + assert(mapped_begin_ == nullptr); + + size_t minDiskSize = static_cast(file_offset_) + view_size_; + + if (minDiskSize > reserved_size_) { + status = Allocate(file_offset_, view_size_); + if (!status.ok()) { + return status; + } + } + + // Need to remap + if (hMap_ == NULL || reserved_size_ > mapping_size_) { + + if (hMap_ != NULL) { + // Unmap the previous one + BOOL ret __attribute__((__unused__)); + ret = ::CloseHandle(hMap_); + assert(ret); + hMap_ = NULL; + } + + ULARGE_INTEGER mappingSize; + mappingSize.QuadPart = reserved_size_; + + hMap_ = CreateFileMappingA( + hFile_, + NULL, // Security attributes + PAGE_READWRITE, // There is not a write only mode for mapping + mappingSize.HighPart, // Enable mapping the whole file but the actual + // amount mapped is determined by MapViewOfFile + mappingSize.LowPart, + NULL); // Mapping name + + if (NULL == hMap_) { + return IOErrorFromWindowsError( + "WindowsMmapFile failed to create file mapping for: " + filename_, + GetLastError()); + } + + mapping_size_ = reserved_size_; + } + + ULARGE_INTEGER offset; + offset.QuadPart = file_offset_; + + // View must begin at the granularity aligned offset + mapped_begin_ = reinterpret_cast( + MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, + view_size_, NULL)); + + if (!mapped_begin_) { + status = IOErrorFromWindowsError( + "WindowsMmapFile failed to map file view: " + filename_, + GetLastError()); + } else { + mapped_end_ = mapped_begin_ + view_size_; + dst_ = mapped_begin_; + last_sync_ = mapped_begin_; + pending_sync_ = false; + } + return status; +} + +Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(filename_, hFile_, spaceToReserve); +} + 
+WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, + size_t page_size, size_t allocation_granularity, + const EnvOptions& options) + : WinFileData(fname, hFile, false), + WritableFile(options), + hMap_(NULL), + page_size_(page_size), + allocation_granularity_(allocation_granularity), + reserved_size_(0), + mapping_size_(0), + view_size_(0), + mapped_begin_(nullptr), + mapped_end_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0), + pending_sync_(false) { + // Allocation granularity must be obtained from GetSystemInfo() and must be + // a power of two. + assert(allocation_granularity > 0); + assert((allocation_granularity & (allocation_granularity - 1)) == 0); + + assert(page_size > 0); + assert((page_size & (page_size - 1)) == 0); + + // Only for memory mapped writes + assert(options.use_mmap_writes); + + // View size must be both the multiple of allocation_granularity AND the + // page size and the granularity is usually a multiple of a page size. + const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode + view_size_ = Roundup(viewSize, allocation_granularity_); +} + +WinMmapFile::~WinMmapFile() { + if (hFile_) { + this->Close(); + } +} + +Status WinMmapFile::Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + + while (left > 0) { + assert(mapped_begin_ <= dst_); + size_t avail = mapped_end_ - dst_; + + if (avail == 0) { + Status s = UnmapCurrentRegion(); + if (s.ok()) { + s = MapNewRegion(); + } + + if (!s.ok()) { + return s; + } + } else { + size_t n = std::min(left, avail); + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + pending_sync_ = true; + } + } + + // Now make sure that the last partial page is padded with zeros if needed + size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_); + if (bytesToPad > 0) { + memset(dst_, 0, bytesToPad); + } + + return Status::OK(); +} + +// Means Close() will properly take care of 
truncate
+// and it does not need any additional information
+Status WinMmapFile::Truncate(uint64_t size) {
+  return Status::OK();
+}
+
+// Flushes and unmaps the current view, closes the mapping object, truncates
+// the file to the number of bytes actually written and closes the file handle.
+// Every step is attempted even if an earlier one failed; the first error
+// encountered is the one returned.
+Status WinMmapFile::Close() {
+  Status s;
+
+  assert(NULL != hFile_);
+
+  // We truncate to the precise size so no
+  // uninitialized data at the end. SetEndOfFile
+  // which we use does not write zeros and it is good.
+  // The size is captured before the view is unmapped.
+  const uint64_t targetSize = GetFileSize();
+
+  if (mapped_begin_ != nullptr) {
+    // Sync before unmapping to make sure everything
+    // is on disk and there is not a lazy writing
+    // so we are deterministic with the tests
+    s = Sync();
+
+    // Unmap even when the flush failed, but keep reporting the first error.
+    const Status unmapStatus = UnmapCurrentRegion();
+    if (s.ok()) {
+      s = unmapStatus;
+    }
+  }
+
+  if (NULL != hMap_) {
+    const BOOL ret = ::CloseHandle(hMap_);
+    if (!ret && s.ok()) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+          "Failed to Close mapping for file: " + filename_, lastError);
+    }
+
+    hMap_ = NULL;
+  }
+
+  if (hFile_ != NULL) {
+    // A failed truncation leaves preallocated bytes past the valid data, so
+    // it must not be reported as a successful Close().
+    const Status truncateStatus = TruncateFile(targetSize);
+    if (!truncateStatus.ok() && s.ok()) {
+      s = truncateStatus;
+    }
+
+    const BOOL ret = ::CloseHandle(hFile_);
+    hFile_ = NULL;
+
+    if (!ret && s.ok()) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+          "Failed to close file map handle: " + filename_, lastError);
+    }
+  }
+
+  return s;
+}
+
+// Nothing to do here: always returns OK.
+Status WinMmapFile::Flush() { return Status::OK(); }
+
+// Flush only data
+// Passes the pages of the current view that lie between last_sync_ and dst_
+// to FlushViewOfFile(). On failure returns an IOError and leaves last_sync_
+// unchanged; on success advances last_sync_ to dst_.
+Status WinMmapFile::Sync() {
+  Status s;
+
+  // Some writes occurred since last sync
+  if (dst_ > last_sync_) {
+    assert(mapped_begin_);
+    assert(dst_);
+    assert(dst_ > mapped_begin_);
+    // dst_ equals mapped_end_ after an Append() that fills the view exactly,
+    // so the upper bound is inclusive.
+    assert(dst_ <= mapped_end_);
+
+    // Offsets, relative to mapped_begin_, computed by TruncateToPageBoundary()
+    // for last_sync_ and for the last written byte (dst_ - 1).
+    const size_t page_begin =
+        TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
+    const size_t page_end =
+        TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
+
+    // Flush only whole pages: from page_begin through the page at page_end
+    if (!::FlushViewOfFile(mapped_begin_ + page_begin,
+                           (page_end - page_begin) + page_size_)) {
+      s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
+                                  GetLastError());
+    } else {
+      last_sync_ = dst_;
+    }
+  }
+
+  return s;
+}
+
+/**
+* Flush data as well as metadata to stable storage.
+*/
+Status WinMmapFile::Fsync() {
+  Status s = Sync();
+
+  // Flush metadata
+  if (s.ok() && pending_sync_) {
+    if (::FlushFileBuffers(hFile_)) {
+      pending_sync_ = false;
+    } else {
+      // Keep pending_sync_ set so that a later Fsync() retries the flush
+      // instead of skipping it after a failure.
+      s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
+                                  GetLastError());
+    }
+  }
+
+  return s;
+}
+
+/**
+* Get the size of valid data in the file. This will not match the
+* size that is returned from the filesystem because we use mmap
+* to extend file by map_size every time.
+*/
+uint64_t WinMmapFile::GetFileSize() {
+  // Bytes written into the current view plus the file offset of that view.
+  const size_t used = dst_ - mapped_begin_;
+  return file_offset_ + used;
+}
+
+// Nothing to invalidate here: always returns OK.
+Status WinMmapFile::InvalidateCache(size_t offset, size_t length) {
+  return Status::OK();
+}
+
+// Ensures that at least offset + len bytes, rounded up to a multiple of
+// view_size_, are reserved for the file. Does nothing when that much is
+// already reserved; otherwise calls PreallocateInternal() and records the new
+// reservation only when it succeeds.
+Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) {
+  Status status;
+  TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);
+
+  // Make sure that we reserve an aligned amount of space
+  // since the reservation block size is driven outside so we want
+  // to check if we are ok with reservation here
+  const size_t spaceToReserve =
+      Roundup(static_cast<size_t>(offset + len), view_size_);
+  // Nothing to do
+  if (spaceToReserve <= reserved_size_) {
+    return status;
+  }
+
+  IOSTATS_TIMER_GUARD(allocate_nanos);
+  status = PreallocateInternal(spaceToReserve);
+  if (status.ok()) {
+    reserved_size_ = spaceToReserve;
+  }
+  return status;
+}
+
+// Forwards to GetUniqueIdFromFile() with this file's handle.
+size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
+  return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+// WinSequentialFile
+
+// Direct I/O is enabled for this file when options.use_direct_reads is set.
+WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
+                                     const EnvOptions& options)
+    : WinFileData(fname, f, options.use_direct_reads) {}
+
+WinSequentialFile::~WinSequentialFile() {
+  assert(hFile_ != INVALID_HANDLE_VALUE);
+}
+
+Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = 0; + + assert(result != nullptr); + if (WinFileData::use_direct_io()) { + return 
Status::NotSupported("Read() does not support direct_io"); + } + + // Windows ReadFile API accepts a DWORD. + // While it is possible to read in a loop if n is too big + // it is an unlikely case. + if (n > std::numeric_limits::max()) { + return Status::InvalidArgument("n is too big for a single ReadFile: " + + filename_); + } + + DWORD bytesToRead = static_cast(n); //cast is safe due to the check above + DWORD bytesRead = 0; + BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL); + if (ret != FALSE) { + r = bytesRead; + } else { + auto lastError = GetLastError(); + if (lastError != ERROR_HANDLE_EOF) { + s = IOErrorFromWindowsError("ReadFile failed: " + filename_, + lastError); + } + } + + *result = Slice(scratch, r); + return s; +} + +Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, size_t& bytes_read) const { + return pread(this, src, numBytes, offset, bytes_read); +} + +Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) { + + Status s; + + if (!WinFileData::use_direct_io()) { + return Status::NotSupported("This function is only used for direct_io"); + } + + if (!IsSectorAligned(static_cast(offset)) || + !IsSectorAligned(n)) { + return Status::InvalidArgument( + "WinSequentialFile::PositionedRead: offset is not properly aligned"); + } + + size_t bytes_read = 0; // out param + s = PositionedReadInternal(scratch, static_cast(n), offset, bytes_read); + *result = Slice(scratch, bytes_read); + return s; +} + + +Status WinSequentialFile::Skip(uint64_t n) { + // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit + // integer. As such it is a highly unlikley case to have n so large. 
+ if (n > static_cast(std::numeric_limits::max())) { + return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" + + filename_); + } + + LARGE_INTEGER li; + li.QuadPart = static_cast(n); //cast is safe due to the check above + BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT); + if (ret == FALSE) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, + lastError); + } + return Status::OK(); +} + +Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////////////////////////////// +/// WinRandomAccessBase + +inline +Status WinRandomAccessImpl::PositionedReadInternal(char* src, + size_t numBytes, + uint64_t offset, + size_t& bytes_read) const { + return pread(file_base_, src, numBytes, offset, bytes_read); +} + +inline +WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, + size_t alignment, + const EnvOptions& options) : + file_base_(file_base), + alignment_(alignment) { + + assert(!options.use_mmap_reads); +} + +inline +Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + + Status s; + + // Check buffer alignment + if (file_base_->use_direct_io()) { + if (!IsSectorAligned(static_cast(offset)) || + !IsAligned(alignment_, scratch)) { + return Status::InvalidArgument( + "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned"); + } + } + + if (n == 0) { + *result = Slice(scratch, 0); + return s; + } + + size_t bytes_read = 0; + s = PositionedReadInternal(scratch, n, offset, bytes_read); + *result = Slice(scratch, bytes_read); + return s; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/// WinRandomAccessFile + +WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, + size_t alignment, + const 
EnvOptions& options) + : WinFileData(fname, hFile, options.use_direct_reads), + WinRandomAccessImpl(this, alignment, options) {} + +WinRandomAccessFile::~WinRandomAccessFile() { +} + +Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + return ReadImpl(offset, n, result, scratch); +} + +Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(GetFileHandle(), id, max_size); +} + +size_t WinRandomAccessFile::GetRequiredBufferAlignment() const { + return GetAlignment(); +} + +///////////////////////////////////////////////////////////////////////////// +// WinWritableImpl +// + +inline +Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve); +} + +inline +WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment) + : file_data_(file_data), + alignment_(alignment), + next_write_offset_(0), + reservedsize_(0) { + + // Query current position in case ReopenWritableFile is called + // This position is only important for buffered writes + // for unbuffered writes we explicitely specify the position. 
+ LARGE_INTEGER zero_move; + zero_move.QuadPart = 0; // Do not move + LARGE_INTEGER pos; + pos.QuadPart = 0; + BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos, + FILE_CURRENT); + // Querying no supped to fail + if (ret != 0) { + next_write_offset_ = pos.QuadPart; + } else { + assert(false); + } +} + +inline +Status WinWritableImpl::AppendImpl(const Slice& data) { + + Status s; + + if (data.size() > std::numeric_limits::max()) { + return Status::InvalidArgument("data is too long for a single write" + + file_data_->GetName()); + } + + size_t bytes_written = 0; // out param + + if (file_data_->use_direct_io()) { + // With no offset specified we are appending + // to the end of the file + assert(IsSectorAligned(next_write_offset_)); + if (!IsSectorAligned(data.size()) || + !IsAligned(static_cast(GetAlignement()), data.data())) { + s = Status::InvalidArgument( + "WriteData must be page aligned, size must be sector aligned"); + } else { + s = pwrite(file_data_, data, next_write_offset_, bytes_written); + } + } else { + + DWORD bytesWritten = 0; + if (!WriteFile(file_data_->GetFileHandle(), data.data(), + static_cast(data.size()), &bytesWritten, NULL)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to WriteFile: " + file_data_->GetName(), + lastError); + } else { + bytes_written = bytesWritten; + } + } + + if(s.ok()) { + if (bytes_written == data.size()) { + // This matters for direct_io cases where + // we rely on the fact that next_write_offset_ + // is sector aligned + next_write_offset_ += bytes_written; + } else { + s = Status::IOError("Failed to write all bytes: " + + file_data_->GetName()); + } + } + + return s; +} + +inline +Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) { + + if(file_data_->use_direct_io()) { + if (!IsSectorAligned(static_cast(offset)) || + !IsSectorAligned(data.size()) || + !IsAligned(static_cast(GetAlignement()), data.data())) { + return 
Status::InvalidArgument( + "Data and offset must be page aligned, size must be sector aligned"); + } + } + + size_t bytes_written = 0; + Status s = pwrite(file_data_, data, offset, bytes_written); + + if(s.ok()) { + if (bytes_written == data.size()) { + // For sequential write this would be simple + // size extension by data.size() + uint64_t write_end = offset + bytes_written; + if (write_end >= next_write_offset_) { + next_write_offset_ = write_end; + } + } else { + s = Status::IOError("Failed to write all of the requested data: " + + file_data_->GetName()); + } + } + return s; +} + +inline +Status WinWritableImpl::TruncateImpl(uint64_t size) { + + // It is tempting to check for the size for sector alignment + // but truncation may come at the end and there is not a requirement + // for this to be sector aligned so long as we do not attempt to write + // after that. The interface docs state that the behavior is undefined + // in that case. + Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), + size); + + if (s.ok()) { + next_write_offset_ = size; + } + return s; +} + +inline +Status WinWritableImpl::CloseImpl() { + + Status s; + + auto hFile = file_data_->GetFileHandle(); + assert(INVALID_HANDLE_VALUE != hFile); + + if (!::FlushFileBuffers(hFile)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " + + file_data_->GetName(), + lastError); + } + + if(!file_data_->CloseFile() && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(), + lastError); + } + return s; +} + +inline +Status WinWritableImpl::SyncImpl() { + Status s; + if (!::FlushFileBuffers (file_data_->GetFileHandle())) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError); + } + return s; +} + + +inline +Status WinWritableImpl::AllocateImpl(uint64_t offset, 
uint64_t len) { + Status status; + TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds); + + // Make sure that we reserve an aligned amount of space + // since the reservation block size is driven outside so we want + // to check if we are ok with reservation here + size_t spaceToReserve = Roundup(static_cast(offset + len), static_cast(alignment_)); + // Nothing to do + if (spaceToReserve <= reservedsize_) { + return status; + } + + IOSTATS_TIMER_GUARD(allocate_nanos); + status = PreallocateInternal(spaceToReserve); + if (status.ok()) { + reservedsize_ = spaceToReserve; + } + return status; +} + + +//////////////////////////////////////////////////////////////////////////////// +/// WinWritableFile + +WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, + size_t alignment, size_t /* capacity */, + const EnvOptions& options) + : WinFileData(fname, hFile, options.use_direct_writes), + WinWritableImpl(this, alignment), + WritableFile(options) { + assert(!options.use_mmap_writes); +} + +WinWritableFile::~WinWritableFile() { +} + +// Indicates if the class makes use of direct I/O +bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); } + +size_t WinWritableFile::GetRequiredBufferAlignment() const { + return static_cast(GetAlignement()); +} + +Status WinWritableFile::Append(const Slice& data) { + return AppendImpl(data); +} + +Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { + return PositionedAppendImpl(data, offset); +} + +// Need to implement this so the file is truncated correctly +// when buffered and unbuffered mode +Status WinWritableFile::Truncate(uint64_t size) { + return TruncateImpl(size); +} + +Status WinWritableFile::Close() { + return CloseImpl(); +} + + // write out the cached data to the OS cache + // This is now taken care of the WritableFileWriter +Status WinWritableFile::Flush() { + return Status::OK(); +} + +Status WinWritableFile::Sync() { + return SyncImpl(); 
+} + +Status WinWritableFile::Fsync() { return SyncImpl(); } + +bool WinWritableFile::IsSyncThreadSafe() const { return true; } + +uint64_t WinWritableFile::GetFileSize() { + return GetFileNextWriteOffset(); +} + +Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) { + return AllocateImpl(offset, len); +} + +size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(GetFileHandle(), id, max_size); +} + +///////////////////////////////////////////////////////////////////////// +/// WinRandomRWFile + +WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, + size_t alignment, const EnvOptions& options) + : WinFileData(fname, hFile, + options.use_direct_reads && options.use_direct_writes), + WinRandomAccessImpl(this, alignment, options), + WinWritableImpl(this, alignment) {} + +bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); } + +size_t WinRandomRWFile::GetRequiredBufferAlignment() const { + return static_cast(GetAlignement()); +} + +Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { + return PositionedAppendImpl(data, offset); +} + +Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + return ReadImpl(offset, n, result, scratch); +} + +Status WinRandomRWFile::Flush() { + return Status::OK(); +} + +Status WinRandomRWFile::Sync() { + return SyncImpl(); +} + +Status WinRandomRWFile::Close() { + return CloseImpl(); +} + +////////////////////////////////////////////////////////////////////////// +/// WinMemoryMappedBufer +WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { + BOOL ret +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif + if (base_ != nullptr) { + ret = ::UnmapViewOfFile(base_); + assert(ret); + base_ = nullptr; + } + if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(map_handle_); + assert(ret); + map_handle_ = NULL; + } + if 
(file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(file_handle_); + assert(ret); + file_handle_ = NULL; + } +}
+
+//////////////////////////////////////////////////////////////////////////
+/// WinDirectory
+
+// Performs no work: always returns OK.
+Status WinDirectory::Fsync() { return Status::OK(); }
+
+// Forwards to GetUniqueIdFromFile() with the directory handle.
+size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
+  return GetUniqueIdFromFile(handle_, id, max_size);
+}
+//////////////////////////////////////////////////////////////////////////
+/// WinFileLock
+
+// Closes the handle held by the lock object.
+WinFileLock::~WinFileLock() {
+  // The (void) cast keeps builds without assertions warning-free and, unlike
+  // a GCC-style __attribute__, is accepted by every compiler including MSVC.
+  const BOOL ret = ::CloseHandle(hFile_);
+  assert(ret);
+  (void)ret;
+}
+
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/io_win.h b/src/rocksdb/port/win/io_win.h new file mode 100644 index 000000000..4fc2ba499 --- /dev/null +++ b/src/rocksdb/port/win/io_win.h @@ -0,0 +1,456 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include + +#include "rocksdb/status.h" +#include "rocksdb/env.h" +#include "util/aligned_buffer.h" + +#include + +namespace ROCKSDB_NAMESPACE { +namespace port { + +std::string GetWindowsErrSz(DWORD err); + +inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { + return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) + ? Status::NoSpace(context, GetWindowsErrSz(err)) + : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) + ? 
Status::PathNotFound(context, GetWindowsErrSz(err)) + : Status::IOError(context, GetWindowsErrSz(err)); +} + +inline Status IOErrorFromLastWindowsError(const std::string& context) { + return IOErrorFromWindowsError(context, GetLastError()); +} + +inline Status IOError(const std::string& context, int err_number) { + return (err_number == ENOSPC) + ? Status::NoSpace(context, strerror(err_number)) + : (err_number == ENOENT) + ? Status::PathNotFound(context, strerror(err_number)) + : Status::IOError(context, strerror(err_number)); +} + +class WinFileData; + +Status pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written); + +Status pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read); + +Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); + +Status ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); + +size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size); + +class WinFileData { + protected: + const std::string filename_; + HANDLE hFile_; + // If true, the I/O issued would be direct I/O which the buffer + // will need to be aligned (not sure there is a guarantee that the buffer + // passed in is aligned). 
+ const bool use_direct_io_; + + public: + // We want this class be usable both for inheritance (prive + // or protected) and for containment so __ctor and __dtor public + WinFileData(const std::string& filename, HANDLE hFile, bool direct_io) + : filename_(filename), hFile_(hFile), use_direct_io_(direct_io) {} + + virtual ~WinFileData() { this->CloseFile(); } + + bool CloseFile() { + bool result = true; + + if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) { + result = ::CloseHandle(hFile_); + assert(result); + hFile_ = NULL; + } + return result; + } + + const std::string& GetName() const { return filename_; } + + HANDLE GetFileHandle() const { return hFile_; } + + bool use_direct_io() const { return use_direct_io_; } + + WinFileData(const WinFileData&) = delete; + WinFileData& operator=(const WinFileData&) = delete; +}; + +class WinSequentialFile : protected WinFileData, public SequentialFile { + + // Override for behavior change when creating a custom env + virtual Status PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, size_t& bytes_read) const; + +public: + WinSequentialFile(const std::string& fname, HANDLE f, + const EnvOptions& options); + + ~WinSequentialFile(); + + WinSequentialFile(const WinSequentialFile&) = delete; + WinSequentialFile& operator=(const WinSequentialFile&) = delete; + + virtual Status Read(size_t n, Slice* result, char* scratch) override; + virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override; + + virtual Status Skip(uint64_t n) override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } +}; + +// mmap() based random-access +class WinMmapReadableFile : private WinFileData, public RandomAccessFile { + HANDLE hMap_; + + const void* mapped_region_; + const size_t length_; + + public: + // mapped_region_[0,length-1] contains the mmapped contents of the file. 
+ WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap, + const void* mapped_region, size_t length); + + ~WinMmapReadableFile(); + + WinMmapReadableFile(const WinMmapReadableFile&) = delete; + WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete; + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +// We preallocate and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class WinMmapFile : private WinFileData, public WritableFile { + private: + HANDLE hMap_; + + const size_t page_size_; // We flush the mapping view in page_size + // increments. We may decide if this is a memory + // page size or SSD page size + const size_t + allocation_granularity_; // View must start at such a granularity + + size_t reserved_size_; // Preallocated size + + size_t mapping_size_; // The max size of the mapping object + // we want to guess the final file size to minimize the remapping + size_t view_size_; // How much memory to map into a view at a time + + char* mapped_begin_; // Must begin at the file offset that is aligned with + // allocation_granularity_ + char* mapped_end_; + char* dst_; // Where to write next (in range [mapped_begin_,mapped_end_]) + char* last_sync_; // Where have we synced up to + + uint64_t file_offset_; // Offset of mapped_begin_ in file + + // Do we have unsynced writes? 
+ bool pending_sync_; + + // Can only truncate or reserve to a sector size aligned if + // used on files that are opened with Unbuffered I/O + Status TruncateFile(uint64_t toSize); + + Status UnmapCurrentRegion(); + + Status MapNewRegion(); + + virtual Status PreallocateInternal(uint64_t spaceToReserve); + + public: + WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, + size_t allocation_granularity, const EnvOptions& options); + + ~WinMmapFile(); + + WinMmapFile(const WinMmapFile&) = delete; + WinMmapFile& operator=(const WinMmapFile&) = delete; + + virtual Status Append(const Slice& data) override; + + // Means Close() will properly take care of truncate + // and it does not need any additional information + virtual Status Truncate(uint64_t size) override; + + virtual Status Close() override; + + virtual Status Flush() override; + + // Flush only data + virtual Status Sync() override; + + /** + * Flush data as well as metadata to stable storage. + */ + virtual Status Fsync() override; + + /** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. 
+ */ + virtual uint64_t GetFileSize() override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual Status Allocate(uint64_t offset, uint64_t len) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinRandomAccessImpl { + protected: + WinFileData* file_base_; + size_t alignment_; + + // Override for behavior change when creating a custom env + virtual Status PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, size_t& bytes_read) const; + + WinRandomAccessImpl(WinFileData* file_base, size_t alignment, + const EnvOptions& options); + + virtual ~WinRandomAccessImpl() {} + + Status ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const; + + size_t GetAlignment() const { return alignment_; } + + public: + + WinRandomAccessImpl(const WinRandomAccessImpl&) = delete; + WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete; +}; + +// pread() based random-access +class WinRandomAccessFile + : private WinFileData, + protected WinRandomAccessImpl, // Want to be able to override + // PositionedReadInternal + public RandomAccessFile { + public: + WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, + const EnvOptions& options); + + ~WinRandomAccessFile(); + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; + + virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual size_t GetRequiredBufferAlignment() const override; +}; + +// This is a sequential write class. It has been mimicked (as others) after +// the original Posix class. We add support for unbuffered I/O on windows as +// well +// we utilize the original buffer as an alignment buffer to write directly to +// file with no buffering. 
+// No buffering requires that the provided buffer is aligned to the physical +// sector size (SSD page size) and +// that all SetFilePointer() operations to occur with such an alignment. +// We thus always write in sector/page size increments to the drive and leave +// the tail for the next write OR for Close() at which point we pad with zeros. +// No padding is required for +// buffered access. +class WinWritableImpl { + protected: + WinFileData* file_data_; + const uint64_t alignment_; + uint64_t next_write_offset_; // Needed because Windows does not support O_APPEND + uint64_t reservedsize_; // how far we have reserved space + + virtual Status PreallocateInternal(uint64_t spaceToReserve); + + WinWritableImpl(WinFileData* file_data, size_t alignment); + + ~WinWritableImpl() {} + + uint64_t GetAlignement() const { return alignment_; } + + Status AppendImpl(const Slice& data); + + // Requires that the data is aligned as specified by + // GetRequiredBufferAlignment() + Status PositionedAppendImpl(const Slice& data, uint64_t offset); + + Status TruncateImpl(uint64_t size); + + Status CloseImpl(); + + Status SyncImpl(); + + uint64_t GetFileNextWriteOffset() { + // Double accounting now here with WritableFileWriter + // and this size will be wrong when unbuffered access is used + // but tests implement their own writable files and do not use + // WritableFileWrapper + // so we need to squeeze a square peg through + // a round hole here. 
+ return next_write_offset_; + } + + Status AllocateImpl(uint64_t offset, uint64_t len); + + public: + WinWritableImpl(const WinWritableImpl&) = delete; + WinWritableImpl& operator=(const WinWritableImpl&) = delete; +}; + +class WinWritableFile : private WinFileData, + protected WinWritableImpl, + public WritableFile { + public: + WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, + size_t capacity, const EnvOptions& options); + + ~WinWritableFile(); + + virtual Status Append(const Slice& data) override; + + // Requires that the data is aligned as specified by + // GetRequiredBufferAlignment() + virtual Status PositionedAppend(const Slice& data, uint64_t offset) override; + + // Need to implement this so the file is truncated correctly + // when buffered and unbuffered mode + virtual Status Truncate(uint64_t size) override; + + virtual Status Close() override; + + // write out the cached data to the OS cache + // This is now taken care of the WritableFileWriter + virtual Status Flush() override; + + virtual Status Sync() override; + + virtual Status Fsync() override; + + virtual bool IsSyncThreadSafe() const override; + + // Indicates if the class makes use of direct I/O + // Use PositionedAppend + virtual bool use_direct_io() const override; + + virtual size_t GetRequiredBufferAlignment() const override; + + virtual uint64_t GetFileSize() override; + + virtual Status Allocate(uint64_t offset, uint64_t len) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinRandomRWFile : private WinFileData, + protected WinRandomAccessImpl, + protected WinWritableImpl, + public RandomRWFile { + public: + WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment, + const EnvOptions& options); + + ~WinRandomRWFile() {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + virtual bool use_direct_io() const override; + + // Use the returned 
alignment value to allocate aligned + // buffer for Write() when use_direct_io() returns true + virtual size_t GetRequiredBufferAlignment() const override; + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + virtual Status Write(uint64_t offset, const Slice& data) override; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + virtual Status Flush() override; + + virtual Status Sync() override; + + virtual Status Fsync() { return Sync(); } + + virtual Status Close() override; +}; + +class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { +private: + HANDLE file_handle_; + HANDLE map_handle_; +public: + WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, size_t size) : + MemoryMappedFileBuffer(base, size), + file_handle_(file_handle), + map_handle_(map_handle) {} + ~WinMemoryMappedBuffer() override; +}; + +class WinDirectory : public Directory { + HANDLE handle_; + public: + explicit WinDirectory(HANDLE h) noexcept : handle_(h) { + assert(handle_ != INVALID_HANDLE_VALUE); + } + ~WinDirectory() { + ::CloseHandle(handle_); + } + virtual Status Fsync() override; + + size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinFileLock : public FileLock { + public: + explicit WinFileLock(HANDLE hFile) : hFile_(hFile) { + assert(hFile != NULL); + assert(hFile != INVALID_HANDLE_VALUE); + } + + ~WinFileLock(); + + private: + HANDLE hFile_; +}; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/win/port_win.cc b/src/rocksdb/port/win/port_win.cc new file mode 100644 index 000000000..6e43b1b58 --- /dev/null +++ b/src/rocksdb/port/win/port_win.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if !defined(OS_WIN) && !defined(WIN32) && !defined(_WIN32)
+#error Windows Specific Code
+#endif
+
+#include "port/win/port_win.h"
+
+// NOTE(review): the system header names (and the template arguments further
+// down) appear to have been lost from this listing -- confirm against the
+// upstream file before applying.
+#include
+#include "port/port_dirent.h"
+#include "port/sys_time.h"
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+// utf8 <-> utf16
+#include
+#include
+#include
+#endif
+
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const bool kDefaultToAdaptiveMutex = false;
+
+namespace port {
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+// Converts a UTF-16 wide string to a UTF-8 byte string.
+std::string utf16_to_utf8(const std::wstring& utf16) {
+  std::wstring_convert,wchar_t> convert;
+  return convert.to_bytes(utf16);
+}
+
+// Converts a UTF-8 byte string to a UTF-16 wide string.
+std::wstring utf8_to_utf16(const std::string& utf8) {
+  std::wstring_convert> converter;
+  return converter.from_bytes(utf8);
+}
+#endif
+
+// Fills *tv from std::chrono::system_clock: tv_sec receives the whole
+// seconds since the clock's epoch and tv_usec the remaining microseconds.
+// The timezone argument is ignored.
+void gettimeofday(struct timeval* tv, struct timezone* /* tz */) {
+  using namespace std::chrono;
+
+  microseconds usNow(
+      duration_cast(system_clock::now().time_since_epoch()));
+
+  seconds secNow(duration_cast(usNow));
+
+  tv->tv_sec = static_cast(secNow.count());
+  tv->tv_usec = static_cast(usNow.count() -
+      duration_cast(secNow).count());
+}
+
+Mutex::~Mutex() {}
+
+CondVar::~CondVar() {}
+
+// Blocks on the condition variable using the mutex passed at construction.
+void CondVar::Wait() {
+  // Caller must ensure that mutex is held prior to calling this method
+  std::unique_lock lk(mu_->getLock(), std::adopt_lock);
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  cv_.wait(lk);
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+  // Release ownership of the lock as we don't want it to be unlocked when
+  // it goes out of scope (as we adopted the lock and didn't lock it ourselves)
+  lk.release();
+}
+
+// Waits until signalled or until the absolute time abs_time_us (microseconds
+// on the system_clock timeline) has passed. Returns true if the wait timed
+// out and false otherwise.
+bool CondVar::TimedWait(uint64_t abs_time_us) {
+
+  using namespace std::chrono;
+
+  // MSVC++ library implements wait_until in terms of wait_for so
+  // we need to convert absolute wait into relative wait.
+  microseconds usAbsTime(abs_time_us);
+
+  microseconds usNow(
+      duration_cast(system_clock::now().time_since_epoch()));
+  // A deadline that is already in the past becomes a zero-length wait
+  microseconds relTimeUs =
+      (usAbsTime > usNow) ? (usAbsTime - usNow) : microseconds::zero();
+
+  // Caller must ensure that mutex is held prior to calling this method
+  std::unique_lock lk(mu_->getLock(), std::adopt_lock);
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  std::cv_status cvStatus = cv_.wait_for(lk, relTimeUs);
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+  // Release ownership of the lock as we don't want it to be unlocked when
+  // it goes out of scope (as we adopted the lock and didn't lock it ourselves)
+  lk.release();
+
+  if (cvStatus == std::cv_status::timeout) {
+    return true;
+  }
+
+  return false;
+}
+
+void CondVar::Signal() { cv_.notify_one(); }
+
+void CondVar::SignalAll() { cv_.notify_all(); }
+
+// Returns the value of GetCurrentProcessorNumber() for the calling thread.
+int PhysicalCoreID() { return GetCurrentProcessorNumber(); }
+
+// Runs initializer through std::call_once on the flag held by *once.
+void InitOnce(OnceType* once, void (*initializer)()) {
+  std::call_once(once->flag_, initializer);
+}
+
+// Private structure, exposed only by pointer
+struct DIR {
+  HANDLE handle_;
+  // True until the entry fetched by opendir() has been handed out by readdir()
+  bool firstread_;
+  RX_WIN32_FIND_DATA data_;
+  dirent entry_;
+
+  DIR() : handle_(INVALID_HANDLE_VALUE),
+    firstread_(true) {}
+
+  DIR(const DIR&) = delete;
+  DIR& operator=(const DIR&) = delete;
+
+  ~DIR() {
+    if (INVALID_HANDLE_VALUE != handle_) {
+      ::FindClose(handle_);
+    }
+  }
+};
+
+// Starts enumerating `name` with the pattern "<name>\*" and caches the first
+// entry. Returns nullptr with errno == ENOENT for a null or empty name.
+// NOTE(review): when the find call itself fails, nullptr is returned without
+// errno being set here -- callers that inspect errno may see a stale value.
+DIR* opendir(const char* name) {
+  if (!name || *name == 0) {
+    errno = ENOENT;
+    return nullptr;
+  }
+
+  std::string pattern(name);
+  pattern.append("\\").append("*");
+
+  std::unique_ptr dir(new DIR);
+
+  dir->handle_ = RX_FindFirstFileEx(RX_FN(pattern).c_str(),
+    FindExInfoBasic, // Do not want alternative name
+    &dir->data_,
+    FindExSearchNameMatch,
+    NULL, // lpSearchFilter
+    0);
+
+  if (dir->handle_ == INVALID_HANDLE_VALUE) {
+    return nullptr;
+  }
+
+  RX_FILESTRING x(dir->data_.cFileName, RX_FNLEN(dir->data_.cFileName));
+  strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name),
+           FN_TO_RX(x).c_str());
+
+  return dir.release();
+}
+
+// Returns the next directory entry, or nullptr when the find call reports no
+// further entry. Sets errno to EBADF for a null or unopened DIR. The returned
+// pointer refers to storage inside *dirp and is overwritten by the next call.
+struct dirent* readdir(DIR* dirp) {
+  if (!dirp || dirp->handle_ == INVALID_HANDLE_VALUE) {
+    errno = EBADF;
+    return nullptr;
+  }
+
+  // The first entry was already fetched by opendir()
+  if (dirp->firstread_) {
+    dirp->firstread_ = false;
+    return &dirp->entry_;
+  }
+
+  auto ret = RX_FindNextFile(dirp->handle_, &dirp->data_);
+
+  if (ret == 0) {
+    return nullptr;
+  }
+
+  RX_FILESTRING x(dirp->data_.cFileName, RX_FNLEN(dirp->data_.cFileName));
+  strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name),
+           FN_TO_RX(x).c_str());
+
+  return &dirp->entry_;
+}
+
+// Destroys the DIR (its destructor closes the find handle). Always returns 0.
+int closedir(DIR* dirp) {
+  delete dirp;
+  return 0;
+}
+
+// C-string front end for Truncate(); a null path yields -1 with
+// errno == EFAULT.
+int truncate(const char* path, int64_t length) {
+  if (path == nullptr) {
+    errno = EFAULT;
+    return -1;
+  }
+  return ROCKSDB_NAMESPACE::port::Truncate(path, length);
+}
+
+// Sets the end-of-file position of an existing file to `len` bytes.
+// Returns 0 on success. On failure returns -1 and sets errno to EINVAL
+// (negative length), ENOENT, EACCES or EIO.
+int Truncate(std::string path, int64_t len) {
+
+  if (len < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  HANDLE hFile =
+      RX_CreateFile(RX_FN(path).c_str(), GENERIC_READ | GENERIC_WRITE,
+                    FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                    NULL,           // Security attrs
+                    OPEN_EXISTING,  // Truncate existing file only
+                    FILE_ATTRIBUTE_NORMAL, NULL);
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    if (lastError == ERROR_FILE_NOT_FOUND) {
+      errno = ENOENT;
+    } else if (lastError == ERROR_ACCESS_DENIED) {
+      errno = EACCES;
+    } else {
+      errno = EIO;
+    }
+    return -1;
+  }
+
+  int result = 0;
+  FILE_END_OF_FILE_INFO end_of_file;
+  end_of_file.EndOfFile.QuadPart = len;
+
+  if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
+                                  sizeof(FILE_END_OF_FILE_INFO))) {
+    errno = EIO;
+    result = -1;
+  }
+
+  CloseHandle(hFile);
+  return result;
+}
+
+// Prints the crash location to stdout, flushes it and aborts the process.
+void Crash(const std::string& srcfile, int srcline) {
+  fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
+  fflush(stdout);
+  abort();
+}
+
+// Always returns -1 in this port.
+int GetMaxOpenFiles() { return -1; }
+
+// Assume 4KB page size
+const size_t kPageSize = 4U * 1024U;
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/port_win.h b/src/rocksdb/port/win/port_win.h
new file mode 100644
index 000000000..76fc8cf39
--- /dev/null
+++ b/src/rocksdb/port/win/port_win.h
@@ -0,0 +1,398 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#pragma once
+
+// Always want minimum headers
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+
+// NOTE(review): the system header names (and template arguments further
+// down) appear to have been lost from this listing -- confirm against the
+// upstream file before applying.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "port/win/win_thread.h"
+
+#include "rocksdb/options.h"
+
+#undef min
+#undef max
+#undef DeleteFile
+#undef GetCurrentTime
+
+
+#ifndef strcasecmp
+#define strcasecmp _stricmp
+#endif
+
+#undef GetCurrentTime
+#undef DeleteFile
+
+#ifndef _SSIZE_T_DEFINED
+typedef SSIZE_T ssize_t;
+#endif
+
+// size_t printf formatting named in the manner of C99 standard formatting
+// strings such as PRIu64
+// in fact, we could use that one
+#ifndef ROCKSDB_PRIszt
+#define ROCKSDB_PRIszt "Iu"
+#endif
+
+#ifdef _MSC_VER
+// Expands to nothing so GCC-style attributes are ignored under MSVC
+#define __attribute__(A)
+
+// Thread local storage on Linux
+// There is thread_local in C++11
+#ifndef __thread
+#define __thread __declspec(thread)
+#endif
+
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Expands to nothing in this port
+#define PREFETCH(addr, rw, locality)
+
+extern const bool kDefaultToAdaptiveMutex;
+
+namespace port {
+
+// VS < 2015
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+
+// VS 15 has snprintf
+#define snprintf _snprintf
+
+#define ROCKSDB_NOEXCEPT
+// std::numeric_limits::max() is not constexpr just yet
+// therefore, use the same limits
+
+// For use at db/file_indexer.h kLevelMaxIndex
+const uint32_t kMaxUint32 = UINT32_MAX;
+const int kMaxInt32 = INT32_MAX;
+const int kMinInt32 = INT32_MIN;
+const int64_t kMaxInt64 = INT64_MAX;
+const int64_t kMinInt64 = INT64_MIN;
+const uint64_t kMaxUint64 = UINT64_MAX;
+
+#ifdef _WIN64
+const size_t kMaxSizet = UINT64_MAX;
+#else
+const size_t kMaxSizet = UINT_MAX;
+#endif
+
+#else  // VS >= 2015 or MinGW
+
+#define ROCKSDB_NOEXCEPT noexcept
+
+// For use at db/file_indexer.h kLevelMaxIndex
+const uint32_t kMaxUint32 = std::numeric_limits::max();
+const int kMaxInt32 = std::numeric_limits::max();
+const int kMinInt32 = std::numeric_limits::min();
+const uint64_t kMaxUint64 = std::numeric_limits::max();
+const int64_t kMaxInt64 = std::numeric_limits::max();
+const int64_t kMinInt64 = std::numeric_limits::min();
+
+const size_t kMaxSizet = std::numeric_limits::max();
+
+#endif  //_MSC_VER
+
+// "Windows is designed to run on little-endian computer architectures."
+// https://docs.microsoft.com/en-us/windows/win32/sysinfo/registry-value-types
+constexpr bool kLittleEndian = true;
+#undef PLATFORM_IS_LITTLE_ENDIAN
+
+class CondVar;
+
+// Thin wrapper over std::mutex. In debug builds it also tracks a `locked_`
+// flag that AssertHeld() checks.
+class Mutex {
+ public:
+
+  // The `adaptive` argument is accepted but not used by this implementation
+  /* implicit */ Mutex(bool adaptive = kDefaultToAdaptiveMutex)
+#ifndef NDEBUG
+     : locked_(false)
+#endif
+  { }
+
+  ~Mutex();
+
+  void Lock() {
+    mutex_.lock();
+#ifndef NDEBUG
+    locked_ = true;
+#endif
+  }
+
+  void Unlock() {
+#ifndef NDEBUG
+    locked_ = false;
+#endif
+    mutex_.unlock();
+  }
+
+  // this will assert if the mutex is not locked
+  // it does NOT verify that mutex is held by a calling thread
+  void AssertHeld() {
+#ifndef NDEBUG
+    assert(locked_);
+#endif
+  }
+
+  // Mutex is not copyable
+  Mutex(const Mutex&) = delete;
+  void operator=(const Mutex&) = delete;
+
+ private:
+
+  friend class CondVar;
+
+  // Gives CondVar access to the underlying std::mutex
+  std::mutex& getLock() {
+    return mutex_;
+  }
+
+  std::mutex mutex_;
+#ifndef NDEBUG
+  bool locked_;
+#endif
+};
+
+// Reader/writer lock implemented on top of a Windows SRWLOCK.
+class RWMutex {
+ public:
+  RWMutex() { InitializeSRWLock(&srwLock_); }
+  // No copying allowed
+  RWMutex(const RWMutex&) = delete;
+  void operator=(const RWMutex&) = delete;
+
+  void ReadLock() { AcquireSRWLockShared(&srwLock_); }
+
+  void WriteLock() { AcquireSRWLockExclusive(&srwLock_); }
+
+  void ReadUnlock() { ReleaseSRWLockShared(&srwLock_); }
+
+  void WriteUnlock() { ReleaseSRWLockExclusive(&srwLock_); }
+
+  // Empty as in POSIX
+  void AssertHeld() {}
+
+ private:
+  SRWLOCK srwLock_;
+};
+
+// Condition variable bound to the Mutex passed at construction.
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu) : mu_(mu) {
+  }
+
+  ~CondVar();
+  void Wait();
+  bool TimedWait(uint64_t expiration_time);
+  void Signal();
+  void SignalAll();
+
+  // Condition var is not copy/move constructible
+  CondVar(const CondVar&) = delete;
+  CondVar& operator=(const CondVar&) = delete;
+
+  CondVar(CondVar&&) = delete;
+  CondVar& operator=(CondVar&&) = delete;
+
+ private:
+  std::condition_variable cv_;
+  Mutex* mu_;
+};
+
+// Wrapper around the platform efficient
+// or otherwise preferable implementation
+using Thread = WindowsThread;
+
+// OnceInit type helps emulate
+// Posix semantics with initialization
+// adopted in the project
+struct OnceType {
+
+    struct Init {};
+
+    OnceType() {}
+    OnceType(const Init&) {}
+    OnceType(const OnceType&) = delete;
+    OnceType& operator=(const OnceType&) = delete;
+
+    std::once_flag flag_;
+};
+
+#define LEVELDB_ONCE_INIT port::OnceType::Init()
+extern void InitOnce(OnceType* once, void (*initializer)());
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 64U
+#endif
+
+#ifdef ROCKSDB_JEMALLOC
+// Separate inlines so they can be replaced if needed
+void* jemalloc_aligned_alloc(size_t size, size_t alignment) ROCKSDB_NOEXCEPT;
+void jemalloc_aligned_free(void* p) ROCKSDB_NOEXCEPT;
+#endif
+
+// Allocates `size` bytes aligned to CACHE_LINE_SIZE; release the result with
+// cacheline_aligned_free().
+inline void *cacheline_aligned_alloc(size_t size) {
+#ifdef ROCKSDB_JEMALLOC
+  return jemalloc_aligned_alloc(size, CACHE_LINE_SIZE);
+#else
+  return _aligned_malloc(size, CACHE_LINE_SIZE);
+#endif
+}
+
+// Releases memory obtained from cacheline_aligned_alloc().
+inline void cacheline_aligned_free(void *memblock) {
+#ifdef ROCKSDB_JEMALLOC
+  jemalloc_aligned_free(memblock);
+#else
+  _aligned_free(memblock);
+#endif
+}
+
+extern const size_t kPageSize;
+
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52991 for MINGW32
+// could not be worked around with by -mno-ms-bitfields
+#ifndef __MINGW32__
+#define ALIGN_AS(n) __declspec(align(n))
+#else
+#define ALIGN_AS(n)
+#endif
+
+static inline void AsmVolatilePause() {
+#if defined(_M_IX86) || defined(_M_X64)
+  YieldProcessor();
+#endif
+  // it would be nice to get "wfe" on ARM here
+}
+
+extern int PhysicalCoreID();
+
+// For Thread Local Storage abstraction
+typedef DWORD pthread_key_t;
+
+// Allocates a TLS index via TlsAlloc(). Returns 0 on success or ENOMEM when
+// no index is available. The destructor argument is ignored.
+inline int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) {
+  // Not used
+  (void)destructor;
+
+  pthread_key_t k = TlsAlloc();
+  if (TLS_OUT_OF_INDEXES == k) {
+    return ENOMEM;
+  }
+
+  *key = k;
+  return 0;
+}
+
+// Frees the TLS index. Returns 0 on success or EINVAL if TlsFree() fails.
+inline int pthread_key_delete(pthread_key_t key) {
+  if (!TlsFree(key)) {
+    return EINVAL;
+  }
+  return 0;
+}
+
+// Stores `value` in the calling thread's slot. Returns 0 on success or ENOMEM
+// if TlsSetValue() fails.
+inline int pthread_setspecific(pthread_key_t key, const void* value) {
+  if (!TlsSetValue(key, const_cast(value))) {
+    return ENOMEM;
+  }
+  return 0;
+}
+
+// Returns the calling thread's value for `key`. A null result is ambiguous
+// (stored null vs. failure), so errno is set to EINVAL when GetLastError()
+// reports an error and to NOERROR otherwise.
+inline void* pthread_getspecific(pthread_key_t key) {
+  void* result = TlsGetValue(key);
+  if (!result) {
+    if (GetLastError() != ERROR_SUCCESS) {
+      errno = EINVAL;
+    } else {
+      errno = NOERROR;
+    }
+  }
+  return result;
+}
+
+// UNIX equiv although errno numbers will be off
+// using C-runtime to implement. Note, this does not
+// fill space with zeros in case the file is extended.
+int truncate(const char* path, int64_t length);
+int Truncate(std::string path, int64_t length);
+void Crash(const std::string& srcfile, int srcline);
+extern int GetMaxOpenFiles();
+std::string utf16_to_utf8(const std::wstring& utf16);
+std::wstring utf8_to_utf16(const std::string& utf8);
+
+}  // namespace port
+
+
+// RX_* macros select the wide-character (W) or ANSI (A) Win32 entry points
+// and the matching file-name string type and conversions.
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+
+#define RX_FILESTRING std::wstring
+#define RX_FN(a) ROCKSDB_NAMESPACE::port::utf8_to_utf16(a)
+#define FN_TO_RX(a) ROCKSDB_NAMESPACE::port::utf16_to_utf8(a)
+#define RX_FNLEN(a) ::wcslen(a)
+
+#define RX_DeleteFile DeleteFileW
+#define RX_CreateFile CreateFileW
+#define RX_CreateFileMapping CreateFileMappingW
+#define RX_GetFileAttributesEx GetFileAttributesExW
+#define RX_FindFirstFileEx FindFirstFileExW
+#define RX_FindNextFile FindNextFileW
+#define RX_WIN32_FIND_DATA WIN32_FIND_DATAW
+#define RX_CreateDirectory CreateDirectoryW
+#define RX_RemoveDirectory RemoveDirectoryW
+#define RX_GetFileAttributesEx GetFileAttributesExW
+#define RX_MoveFileEx MoveFileExW
+#define RX_CreateHardLink CreateHardLinkW
+#define RX_PathIsRelative PathIsRelativeW
+#define RX_GetCurrentDirectory GetCurrentDirectoryW
+#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExW
+
+#else
+
+#define RX_FILESTRING std::string
+#define RX_FN(a) a
+#define FN_TO_RX(a) a
+#define RX_FNLEN(a) strlen(a)
+
+#define RX_DeleteFile DeleteFileA
+#define RX_CreateFile CreateFileA
+#define RX_CreateFileMapping CreateFileMappingA
+#define RX_GetFileAttributesEx GetFileAttributesExA
+#define RX_FindFirstFileEx FindFirstFileExA
+#define RX_CreateDirectory CreateDirectoryA
+#define RX_FindNextFile FindNextFileA
+#define RX_WIN32_FIND_DATA WIN32_FIND_DATA
+#define RX_CreateDirectory CreateDirectoryA
+#define RX_RemoveDirectory RemoveDirectoryA
+#define RX_GetFileAttributesEx GetFileAttributesExA
+#define RX_MoveFileEx MoveFileExA
+#define RX_CreateHardLink CreateHardLinkA
+#define RX_PathIsRelative PathIsRelativeA
+#define RX_GetCurrentDirectory GetCurrentDirectoryA
+#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExA
+
+#endif
+
+using port::pthread_key_t;
+using port::pthread_key_create;
+using port::pthread_key_delete;
+using port::pthread_setspecific;
+using port::pthread_getspecific;
+using port::truncate;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/win_jemalloc.cc b/src/rocksdb/port/win/win_jemalloc.cc
new file mode 100644
index 000000000..a29232f8a
--- /dev/null
+++ b/src/rocksdb/port/win/win_jemalloc.cc
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_JEMALLOC
+# error This file can only be part of jemalloc aware build
+#endif
+
+// NOTE(review): the system header names appear to have been lost from this
+// listing -- confirm against the upstream file before applying.
+#include
+#include "jemalloc/jemalloc.h"
+#include "port/win/port_win.h"
+
+#if defined(ZSTD) && defined(ZSTD_STATIC_LINKING_ONLY)
+#include
+#if (ZSTD_VERSION_NUMBER >= 500)
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+// Allocation callback for ZSTD that forwards to je_malloc; `opaque` is unused.
+void* JemallocAllocateForZSTD(void* /* opaque */, size_t size) {
+  return je_malloc(size);
+}
+// Deallocation callback for ZSTD that forwards to je_free; `opaque` is unused.
+void JemallocDeallocateForZSTD(void* /* opaque */, void* address) {
+  je_free(address);
+}
+// Returns a ZSTD_customMem wired to the two callbacks above, with a null
+// opaque pointer.
+ZSTD_customMem GetJeZstdAllocationOverrides() {
+  return {JemallocAllocateForZSTD, JemallocDeallocateForZSTD, nullptr};
+}
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // (ZSTD_VERSION_NUMBER >= 500)
+#endif  // defined(ZSTD) defined(ZSTD_STATIC_LINKING_ONLY)
+
+// Global operators to be replaced by a linker when this file is
+// a part of the build
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+// Note the argument order: je_aligned_alloc takes (alignment, size).
+void* jemalloc_aligned_alloc(size_t size, size_t alignment) ROCKSDB_NOEXCEPT {
+  return je_aligned_alloc(alignment, size);
+}
+void jemalloc_aligned_free(void* p) ROCKSDB_NOEXCEPT { je_free(p); }
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+// Allocates through je_malloc and throws std::bad_alloc when it returns null.
+void* operator new(size_t size) {
+  void* p = je_malloc(size);
+  if (!p) {
+    throw std::bad_alloc();
+  }
+  return p;
+}
+
+// Array form; same behavior as the scalar operator new above.
+void* operator new[](size_t size) {
+  void* p = je_malloc(size);
+  if (!p) {
+    throw std::bad_alloc();
+  }
+  return p;
+}
+
+// Releases through je_free; a null pointer is ignored.
+void operator delete(void* p) {
+  if (p) {
+    je_free(p);
+  }
+}
+
+// Array form; same behavior as the scalar operator delete above.
+void operator delete[](void* p) {
+  if (p) {
+    je_free(p);
+  }
+}
diff --git a/src/rocksdb/port/win/win_logger.cc b/src/rocksdb/port/win/win_logger.cc
new file mode 100644
index 000000000..d429a3f4a
--- /dev/null
+++ b/src/rocksdb/port/win/win_logger.cc
@@ -0,0 +1,192 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. + +#include "port/win/win_logger.h" +#include "port/win/io_win.h" + +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" + +#include "monitoring/iostats_context_imp.h" +#include "port/sys_time.h" + +namespace ROCKSDB_NAMESPACE { + +namespace port { + +WinLogger::WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, + const InfoLogLevel log_level) + : Logger(log_level), + file_(file), + gettid_(gettid), + log_size_(0), + last_flush_micros_(0), + env_(env), + flush_pending_(false) { + assert(file_ != NULL); + assert(file_ != INVALID_HANDLE_VALUE); +} + +void WinLogger::DebugWriter(const char* str, int len) { + assert(file_ != INVALID_HANDLE_VALUE); + DWORD bytesWritten = 0; + BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL); + if (ret == FALSE) { + std::string errSz = GetWindowsErrSz(GetLastError()); + fprintf(stderr, errSz.c_str()); + } +} + +WinLogger::~WinLogger() { + CloseInternal(); +} + +Status WinLogger::CloseImpl() { + return CloseInternal(); +} + +Status WinLogger::CloseInternal() { + Status s; + if (INVALID_HANDLE_VALUE != file_) { + BOOL ret = FlushFileBuffers(file_); + if (ret == 0) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", + lastError); + } + ret = CloseHandle(file_); + // On error the return value is zero + if (ret == 0 && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", + 
lastError); + } + file_ = INVALID_HANDLE_VALUE; + closed_ = true; + } + return s; +} + +void WinLogger::Flush() { + assert(file_ != INVALID_HANDLE_VALUE); + if (flush_pending_) { + flush_pending_ = false; + // With Windows API writes go to OS buffers directly so no fflush needed + // unlike with C runtime API. We don't flush all the way to disk + // for perf reasons. + } + + last_flush_micros_ = env_->NowMicros(); +} + +void WinLogger::Logv(const char* format, va_list ap) { + IOSTATS_TIMER_GUARD(logger_nanos); + assert(file_ != INVALID_HANDLE_VALUE); + + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + std::unique_ptr largeBuffer; + for (int iter = 0; iter < 2; ++iter) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + largeBuffer.reset(new char[bufsize]); + base = largeBuffer.get(); + } + + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_s(&t, &seconds); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + int done = vsnprintf(p, limit - p, format, backup_ap); + if (done > 0) { + p += done; + } else { + continue; + } + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + + 
DWORD bytesWritten = 0; + BOOL ret = WriteFile(file_, base, static_cast(write_size), + &bytesWritten, NULL); + if (ret == FALSE) { + std::string errSz = GetWindowsErrSz(GetLastError()); + fprintf(stderr, errSz.c_str()); + } + + flush_pending_ = true; + assert((bytesWritten == write_size) || (ret == FALSE)); + if (bytesWritten > 0) { + log_size_ += write_size; + } + + uint64_t now_micros = + static_cast(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; + // With Windows API writes go to OS buffers directly so no fflush needed + // unlike with C runtime API. We don't flush all the way to disk + // for perf reasons. + last_flush_micros_ = now_micros; + } + break; + } +} + +size_t WinLogger::GetLogFileSize() const { return log_size_; } + +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/win/win_logger.h b/src/rocksdb/port/win/win_logger.h new file mode 100644 index 000000000..116e7898d --- /dev/null +++ b/src/rocksdb/port/win/win_logger.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. 
+
+#pragma once
+
+// NOTE(review): the system header names appear to have been lost from this
+// listing -- confirm against the upstream file before applying.
+#include
+
+#include "rocksdb/env.h"
+
+#include
+#include
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+
+namespace port {
+
+// Logger that writes to a Win32 file HANDLE.
+class WinLogger : public ROCKSDB_NAMESPACE::Logger {
+ public:
+  // gettid: returns the id of the calling thread.
+  // env: stored and used by the implementation for time queries.
+  // file: the log file handle.
+  // log_level: forwarded to the Logger base class.
+  WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file,
+            const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL);
+
+  virtual ~WinLogger();
+
+  // Not copyable
+  WinLogger(const WinLogger&) = delete;
+
+  WinLogger& operator=(const WinLogger&) = delete;
+
+  void Flush() override;
+
+  // Keep the other Logv overloads of the base class visible
+  using ROCKSDB_NAMESPACE::Logger::Logv;
+  void Logv(const char* format, va_list ap) override;
+
+  size_t GetLogFileSize() const override;
+
+  void DebugWriter(const char* str, int len);
+
+protected:
+
+  Status CloseImpl() override;
+
+ private:
+  HANDLE file_;
+  uint64_t (*gettid_)();  // Return the thread id for the current thread
+  std::atomic_size_t log_size_;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  Env* env_;
+  bool flush_pending_;
+
+  Status CloseInternal();
+
+  const static uint64_t flush_every_seconds_ = 5;
+};
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/win_thread.cc b/src/rocksdb/port/win/win_thread.cc
new file mode 100644
index 000000000..74c667b54
--- /dev/null
+++ b/src/rocksdb/port/win/win_thread.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/win/win_thread.h"
+
+// NOTE(review): the system header names (and template arguments further
+// down) appear to have been lost from this listing -- confirm against the
+// upstream file before applying.
+#include
+#include  // __beginthreadex
+#include
+
+#include
+#include
+#include
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// Private state of a WindowsThread: the callable to run and the thread
+// handle returned by _beginthreadex (0 when there is no joinable thread).
+struct WindowsThread::Data {
+
+  std::function func_;
+  uintptr_t handle_;
+
+  Data(std::function&& func) :
+    func_(std::move(func)),
+    handle_(0) {
+  }
+
+  Data(const Data&) = delete;
+  Data& operator=(const Data&) = delete;
+
+  static unsigned int __stdcall ThreadProc(void* arg);
+};
+
+
+// Creates the thread that will run `func`. Throws std::system_error with
+// resource_unavailable_try_again if the thread cannot be created.
+void WindowsThread::Init(std::function&& func) {
+
+  data_ = std::make_shared(std::move(func));
+  // We create another instance of std::shared_ptr to get an additional ref
+  // since we may detach and destroy this instance before the threadproc
+  // may start to run. We choose to allocate this additional ref on the heap
+  // so we do not need to synchronize and allow this thread to proceed
+  std::unique_ptr> th_data(new std::shared_ptr(data_));
+
+  data_->handle_ = _beginthreadex(NULL,
+    0,    // stack size
+    &Data::ThreadProc,
+    th_data.get(),
+    0,   // init flag
+    &th_id_);
+
+  if (data_->handle_ == 0) {
+    throw std::system_error(std::make_error_code(
+      std::errc::resource_unavailable_try_again),
+      "Unable to create a thread");
+  }
+  // The new thread now owns the heap-allocated reference (see ThreadProc)
+  th_data.release();
+}
+
+WindowsThread::WindowsThread() :
+  data_(nullptr),
+  th_id_(0)
+{}
+
+
+WindowsThread::~WindowsThread() {
+  // Must be joined or detached
+  // before destruction.
+  // This is the same as std::thread
+  if (data_) {
+    if (joinable()) {
+      assert(false);
+      std::terminate();
+    }
+    data_.reset();
+  }
+}
+
+WindowsThread::WindowsThread(WindowsThread&& o) noexcept :
+  WindowsThread() {
+  *this = std::move(o);
+}
+
+// Move assignment. Terminates the process if *this still refers to a
+// joinable thread.
+WindowsThread& WindowsThread::operator=(WindowsThread&& o) noexcept {
+
+  if (joinable()) {
+    assert(false);
+    std::terminate();
+  }
+
+  data_ = std::move(o.data_);
+
+  // The id is copied, not cleared, so after the move both objects report
+  // the same id
+  th_id_ = o.th_id_;
+
+  return *this;
+}
+
+bool WindowsThread::joinable() const {
+  return (data_ && data_->handle_ != 0);
+}
+
+// NOTE(review): dereferences data_ without a null check, so this looks like
+// it must only be called on an object that has a thread -- confirm with
+// callers.
+WindowsThread::native_handle_type WindowsThread::native_handle() const {
+  return reinterpret_cast(data_->handle_);
+}
+
+unsigned WindowsThread::hardware_concurrency() {
+  return std::thread::hardware_concurrency();
+}
+
+// Waits for the thread to finish and closes its handle. Throws
+// std::system_error if the thread is not joinable, if called from the thread
+// itself, or if the wait fails.
+void WindowsThread::join() {
+
+  if (!joinable()) {
+    assert(false);
+    throw std::system_error(
+      std::make_error_code(std::errc::invalid_argument),
+      "Thread is no longer joinable");
+  }
+
+  if (GetThreadId(GetCurrentThread()) == th_id_) {
+    assert(false);
+    throw std::system_error(
+      std::make_error_code(std::errc::resource_deadlock_would_occur),
+      "Can not join itself");
+  }
+
+  auto ret = WaitForSingleObject(reinterpret_cast(data_->handle_),
+    INFINITE);
+  if (ret != WAIT_OBJECT_0) {
+    auto lastError = GetLastError();
+    assert(false);
+    throw std::system_error(static_cast(lastError),
+      std::system_category(),
+      "WaitForSingleObjectFailed: thread join");
+  }
+
+  // rc is only inspected by the assert below, hence the initializer /
+  // unused attribute to keep release builds warning-free
+  BOOL rc
+#if defined(_MSC_VER)
+    = FALSE;
+#else
+    __attribute__((__unused__));
+#endif
+  rc = CloseHandle(reinterpret_cast(data_->handle_));
+  assert(rc != 0);
+  data_->handle_ = 0;
+}
+
+// Closes the thread handle without waiting for the thread. Returns true if
+// CloseHandle succeeded. Throws std::system_error if the thread is not
+// joinable.
+bool WindowsThread::detach() {
+
+  if (!joinable()) {
+    assert(false);
+    throw std::system_error(
+      std::make_error_code(std::errc::invalid_argument),
+      "Thread is no longer available");
+  }
+
+  BOOL ret = CloseHandle(reinterpret_cast(data_->handle_));
+  data_->handle_ = 0;
+
+  return (ret != 0);
+}
+
+void WindowsThread::swap(WindowsThread& o) {
+  data_.swap(o.data_);
+  std::swap(th_id_, o.th_id_);
+}
+
+// Thread entry point. Takes ownership of the heap-allocated shared_ptr that
+// Init() passed as `arg`, runs the stored callable and returns 0.
+unsigned int __stdcall  WindowsThread::Data::ThreadProc(void* arg) {
+  auto ptr = reinterpret_cast*>(arg);
+  std::unique_ptr> data(ptr);
+  (*data)->func_();
+  return 0;
+}
+} // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/win_thread.h b/src/rocksdb/port/win/win_thread.h
new file mode 100644
index 000000000..472b29955
--- /dev/null
+++ b/src/rocksdb/port/win/win_thread.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// This class is a replacement for std::thread.
+// Two reasons we do not use std::thread here:
+//  - std::thread dynamically allocates its internals, which are automatically
+//    freed when the thread terminates and not on the destruction of the
+//    object. This makes it difficult to control the source of memory
+//    allocation.
+//  - This class implements Pimpl so we can easily replace the guts of the
+//    object in our private version if necessary.
+class WindowsThread {
+
+  struct Data;
+
+  // Implementation state (pimpl); null when no thread was ever attached
+  std::shared_ptr data_;
+  // Thread id reported by get_id(); 0 for a default-constructed object
+  unsigned int th_id_;
+
+  void Init(std::function&&);
+
+public:
+
+  typedef void* native_handle_type;
+
+  // Construct with no thread
+  WindowsThread();
+
+  // Template constructor
+  //
+  // This templated constructor accomplishes several things
+  //
+  // - Allows the class as whole to be not a template
+  //
+  // - take "universal" references to support both _lvalues and _rvalues
+  //
+  // -  because this constructor is a catchall case in many respects it
+  //    may prevent us from using both the default __ctor, the move __ctor.
+  //    Also it may circumvent copy __ctor deletion. To work around this
+  //    we make sure this one has at least one argument and eliminate
+  //    it from the overload  selection when WindowsThread is the first
+  //    argument.
+  //
+  // - construct with Fx(Ax...) with a variable number of types/arguments.
+  //
+  // - Gathers together the callable object with its arguments and constructs
+  //   a single callable entity
+  //
+  // - Makes use of std::function to convert it to a specification-template
+  //   dependent type that both checks the signature conformance to ensure
+  //   that all of the necessary arguments are provided and allows pimpl
+  //   implementation.
+  //
+  // NOTE(review): the template parameter list below is incomplete in this
+  // listing (its angle-bracket contents were lost) -- restore it from the
+  // upstream file before applying.
+  template::type,
+                           WindowsThread>::value>::type>
+  explicit WindowsThread(Fn&& fx, Args&&... ax) :
+      WindowsThread() {
+
+    // Use binder to create a single callable entity
+    auto binder = std::bind(std::forward(fx),
+      std::forward(ax)...);
+    // Use std::function to take advantage of the type erasure
+    // so we can still hide implementation within pimpl
+    // This also makes sure that the binder signature is compliant
+    std::function target = binder;
+
+    Init(std::move(target));
+  }
+
+
+  ~WindowsThread();
+
+  // Not copyable
+  WindowsThread(const WindowsThread&) = delete;
+
+  WindowsThread& operator=(const WindowsThread&) = delete;
+
+  WindowsThread(WindowsThread&&) noexcept;
+
+  WindowsThread& operator=(WindowsThread&&) noexcept;
+
+  bool joinable() const;
+
+  unsigned int get_id() const { return th_id_; }
+
+  native_handle_type native_handle() const;
+
+  static unsigned hardware_concurrency();
+
+  void join();
+
+  bool detach();
+
+  void swap(WindowsThread&);
+};
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+namespace std {
+// Swaps two WindowsThread objects via WindowsThread::swap
+inline void swap(ROCKSDB_NAMESPACE::port::WindowsThread& th1,
+                 ROCKSDB_NAMESPACE::port::WindowsThread& th2) {
+  th1.swap(th2);
+}
+} // namespace std
+
diff --git a/src/rocksdb/port/win/xpress_win.cc b/src/rocksdb/port/win/xpress_win.cc
new file mode 100644
index 000000000..bcc106534
--- /dev/null
+++ b/src/rocksdb/port/win/xpress_win.cc
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "port/win/xpress_win.h" +#include + +#include +#include +#include +#include + +#ifdef XPRESS + +// Put this under ifdef so windows systems w/o this +// can still build +#include + +namespace ROCKSDB_NAMESPACE { +namespace port { +namespace xpress { + +// Helpers +namespace { + +auto CloseCompressorFun = [](void* h) { + if (NULL != h) { + ::CloseCompressor(reinterpret_cast(h)); + } +}; + +auto CloseDecompressorFun = [](void* h) { + if (NULL != h) { + ::CloseDecompressor(reinterpret_cast(h)); + } +}; +} + +bool Compress(const char* input, size_t length, std::string* output) { + + assert(input != nullptr); + assert(output != nullptr); + + if (length == 0) { + output->clear(); + return true; + } + + COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; + + COMPRESSOR_HANDLE compressor = NULL; + + BOOL success = CreateCompressor( + COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &compressor); // Handle + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to create Compressor LastError: " << + GetLastError() << std::endl; +#endif + return false; + } + + std::unique_ptr + compressorGuard(compressor, CloseCompressorFun); + + SIZE_T compressedBufferSize = 0; + + // Query compressed buffer size. 
+ success = ::Compress( + compressor, // Compressor Handle + const_cast(input), // Input buffer + length, // Uncompressed data size + NULL, // Compressed Buffer + 0, // Compressed Buffer size + &compressedBufferSize); // Compressed Data size + + if (!success) { + + auto lastError = GetLastError(); + + if (lastError != ERROR_INSUFFICIENT_BUFFER) { +#ifdef _DEBUG + std::cerr << + "XPRESS: Failed to estimate compressed buffer size LastError " << + lastError << std::endl; +#endif + return false; + } + } + + assert(compressedBufferSize > 0); + + std::string result; + result.resize(compressedBufferSize); + + SIZE_T compressedDataSize = 0; + + // Compress + success = ::Compress( + compressor, // Compressor Handle + const_cast(input), // Input buffer + length, // Uncompressed data size + &result[0], // Compressed Buffer + compressedBufferSize, // Compressed Buffer size + &compressedDataSize); // Compressed Data size + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to compress LastError " << + GetLastError() << std::endl; +#endif + return false; + } + + result.resize(compressedDataSize); + output->swap(result); + + return true; +} + +char* Decompress(const char* input_data, size_t input_length, + int* decompress_size) { + + assert(input_data != nullptr); + assert(decompress_size != nullptr); + + if (input_length == 0) { + return nullptr; + } + + COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; + + DECOMPRESSOR_HANDLE decompressor = NULL; + + BOOL success = CreateDecompressor( + COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &decompressor); // Handle + + + if (!success) { +#ifdef _DEBUG + std::cerr << "XPRESS: Failed to create Decompressor LastError " + << GetLastError() << std::endl; +#endif + return nullptr; + } + + std::unique_ptr + compressorGuard(decompressor, CloseDecompressorFun); + + SIZE_T decompressedBufferSize = 0; + + success = ::Decompress( + decompressor, // Compressor Handle + 
const_cast(input_data), // Compressed data + input_length, // Compressed data size + NULL, // Buffer set to NULL + 0, // Buffer size set to 0 + &decompressedBufferSize); // Decompressed Data size + + if (!success) { + + auto lastError = GetLastError(); + + if (lastError != ERROR_INSUFFICIENT_BUFFER) { +#ifdef _DEBUG + std::cerr + << "XPRESS: Failed to estimate decompressed buffer size LastError " + << lastError << std::endl; +#endif + return nullptr; + } + } + + assert(decompressedBufferSize > 0); + + // On Windows we are limited to a 32-bit int for the + // output data size argument + // so we hopefully never get here + if (decompressedBufferSize > std::numeric_limits::max()) { + assert(false); + return nullptr; + } + + // The callers are deallocating using delete[] + // thus we must allocate with new[] + std::unique_ptr outputBuffer(new char[decompressedBufferSize]); + + SIZE_T decompressedDataSize = 0; + + success = ::Decompress( + decompressor, + const_cast(input_data), + input_length, + outputBuffer.get(), + decompressedBufferSize, + &decompressedDataSize); + + if (!success) { +#ifdef _DEBUG + std::cerr << + "XPRESS: Failed to decompress LastError " << + GetLastError() << std::endl; +#endif + return nullptr; + } + + *decompress_size = static_cast(decompressedDataSize); + + // Return the raw buffer to the caller supporting the tradition + return outputBuffer.release(); +} +} +} +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/port/win/xpress_win.h b/src/rocksdb/port/win/xpress_win.h new file mode 100644 index 000000000..1214bb7b9 --- /dev/null +++ b/src/rocksdb/port/win/xpress_win.h @@ -0,0 +1,27 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { +namespace xpress { + +bool Compress(const char* input, size_t length, std::string* output); + +char* Decompress(const char* input_data, size_t input_length, + int* decompress_size); + +} +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/port/xpress.h b/src/rocksdb/port/xpress.h new file mode 100644 index 000000000..457025f66 --- /dev/null +++ b/src/rocksdb/port/xpress.h @@ -0,0 +1,17 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +// Xpress on Windows is implemented using Win API +#if defined(ROCKSDB_PLATFORM_POSIX) +#error "Xpress compression not implemented" +#elif defined(OS_WIN) +#include "port/win/xpress_win.h" +#endif -- cgit v1.2.3