summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/port
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/rocksdb/port
parentInitial commit. (diff)
downloadceph-upstream/18.2.2.tar.xz
ceph-upstream/18.2.2.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/port')
-rw-r--r--src/rocksdb/port/README10
-rw-r--r--src/rocksdb/port/jemalloc_helper.h107
-rw-r--r--src/rocksdb/port/lang.h70
-rw-r--r--src/rocksdb/port/likely.h18
-rw-r--r--src/rocksdb/port/malloc.h17
-rw-r--r--src/rocksdb/port/port.h21
-rw-r--r--src/rocksdb/port/port_dirent.h44
-rw-r--r--src/rocksdb/port/port_example.h101
-rw-r--r--src/rocksdb/port/port_posix.cc300
-rw-r--r--src/rocksdb/port/port_posix.h241
-rw-r--r--src/rocksdb/port/stack_trace.cc202
-rw-r--r--src/rocksdb/port/stack_trace.h31
-rw-r--r--src/rocksdb/port/sys_time.h63
-rw-r--r--src/rocksdb/port/util_logger.h18
-rw-r--r--src/rocksdb/port/win/env_default.cc45
-rw-r--r--src/rocksdb/port/win/env_win.cc1437
-rw-r--r--src/rocksdb/port/win/env_win.h304
-rw-r--r--src/rocksdb/port/win/io_win.cc1101
-rw-r--r--src/rocksdb/port/win/io_win.h508
-rw-r--r--src/rocksdb/port/win/port_win.cc303
-rw-r--r--src/rocksdb/port/win/port_win.h378
-rw-r--r--src/rocksdb/port/win/win_jemalloc.cc80
-rw-r--r--src/rocksdb/port/win/win_logger.cc192
-rw-r--r--src/rocksdb/port/win/win_logger.h64
-rw-r--r--src/rocksdb/port/win/win_thread.cc170
-rw-r--r--src/rocksdb/port/win/win_thread.h117
-rw-r--r--src/rocksdb/port/win/xpress_win.cc210
-rw-r--r--src/rocksdb/port/win/xpress_win.h26
-rw-r--r--src/rocksdb/port/xpress.h17
29 files changed, 6195 insertions, 0 deletions
diff --git a/src/rocksdb/port/README b/src/rocksdb/port/README
new file mode 100644
index 000000000..422563e25
--- /dev/null
+++ b/src/rocksdb/port/README
@@ -0,0 +1,10 @@
+This directory contains interfaces and implementations that isolate the
+rest of the package from platform details.
+
+Code in the rest of the package includes "port.h" from this directory.
+"port.h" in turn includes a platform specific "port_<platform>.h" file
+that provides the platform specific implementation.
+
+See port_posix.h for an example of what must be provided in a platform
+specific header file.
+
diff --git a/src/rocksdb/port/jemalloc_helper.h b/src/rocksdb/port/jemalloc_helper.h
new file mode 100644
index 000000000..f085f6226
--- /dev/null
+++ b/src/rocksdb/port/jemalloc_helper.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#if defined(__clang__) && defined(__GLIBC__)
+// glibc's `posix_memalign()` declaration specifies `throw()` while clang's
+// declaration does not. There is a hack in clang to make its re-declaration
+// compatible with glibc's if they are declared consecutively. That hack breaks
+// if yet another `posix_memalign()` declaration comes between glibc's and
+// clang's declarations. Include "mm_malloc.h" here ensures glibc's and clang's
+// declarations both come before "jemalloc.h"'s `posix_memalign()` declaration.
+//
+// This problem could also be avoided if "jemalloc.h"'s `posix_memalign()`
+// declaration did not specify `throw()` when built with clang.
+#include <mm_malloc.h>
+#endif
+
+#ifdef ROCKSDB_JEMALLOC
+#ifdef __FreeBSD__
+#include <malloc_np.h>
+#define JEMALLOC_USABLE_SIZE_CONST const
+#else
+#define JEMALLOC_MANGLE
+#include <jemalloc/jemalloc.h>
+#endif
+
+#ifndef JEMALLOC_CXX_THROW
+#define JEMALLOC_CXX_THROW
+#endif
+
+#if defined(OS_WIN) && defined(_MSC_VER)
+
+// MSVC does not have weak symbol support. As long as ROCKSDB_JEMALLOC is
+// defined, Jemalloc memory allocator is used.
+static inline bool HasJemalloc() { return true; }
+
+#else
+
+// definitions for compatibility with older versions of jemalloc
+#if !defined(JEMALLOC_ALLOCATOR)
+#define JEMALLOC_ALLOCATOR
+#endif
+#if !defined(JEMALLOC_RESTRICT_RETURN)
+#define JEMALLOC_RESTRICT_RETURN
+#endif
+#if !defined(JEMALLOC_NOTHROW)
+#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow)
+#endif
+#if !defined(JEMALLOC_ALLOC_SIZE)
+#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE
+#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s))
+#else
+#define JEMALLOC_ALLOC_SIZE(s)
+#endif
+#endif
+
+// Declare non-standard jemalloc APIs as weak symbols. We can null-check these
+// symbols to detect whether jemalloc is linked with the binary.
+extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *
+mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
+ __attribute__((__weak__));
+extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *
+rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int)
+ __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int)
+ JEMALLOC_ATTR(pure) __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int)
+ __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure)
+ __attribute__((__weak__));
+extern "C" int JEMALLOC_NOTHROW mallctl(const char *, void *, size_t *, void *,
+ size_t) __attribute__((__weak__));
+extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *,
+ size_t *)
+ __attribute__((__weak__));
+extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *,
+ size_t *, void *, size_t)
+ __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW
+malloc_stats_print(void (*)(void *, const char *), void *, const char *)
+ __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW
+malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW
+ __attribute__((__weak__));
+
+// Check if Jemalloc is linked with the binary. Note the main program might be
+// using a different memory allocator even this method return true.
+// It is loosely based on folly::usingJEMalloc(), minus the check that actually
+// allocate memory and see if it is through jemalloc, to handle the dlopen()
+// case:
+// https://github.com/facebook/folly/blob/76cf8b5841fb33137cfbf8b224f0226437c855bc/folly/memory/Malloc.h#L147
+static inline bool HasJemalloc() {
+ return mallocx != nullptr && rallocx != nullptr && xallocx != nullptr &&
+ sallocx != nullptr && dallocx != nullptr && sdallocx != nullptr &&
+ nallocx != nullptr && mallctl != nullptr &&
+ mallctlnametomib != nullptr && mallctlbymib != nullptr &&
+ malloc_stats_print != nullptr && malloc_usable_size != nullptr;
+}
+
+#endif
+
+#endif // ROCKSDB_JEMALLOC
diff --git a/src/rocksdb/port/lang.h b/src/rocksdb/port/lang.h
new file mode 100644
index 000000000..52c597acd
--- /dev/null
+++ b/src/rocksdb/port/lang.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef FALLTHROUGH_INTENDED
+#if defined(__clang__)
+#define FALLTHROUGH_INTENDED [[clang::fallthrough]]
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define FALLTHROUGH_INTENDED [[gnu::fallthrough]]
+#else
+#define FALLTHROUGH_INTENDED \
+ do { \
+ } while (0)
+#endif
+#endif
+
+#define DECLARE_DEFAULT_MOVES(Name) \
+ Name(Name&&) noexcept = default; \
+ Name& operator=(Name&&) = default
+
+// ASAN (Address sanitizer)
+
+#if defined(__clang__)
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define MUST_FREE_HEAP_ALLOCATIONS 1
+#endif // __has_feature(address_sanitizer)
+#endif // defined(__has_feature)
+#else // __clang__
+#ifdef __SANITIZE_ADDRESS__
+#define MUST_FREE_HEAP_ALLOCATIONS 1
+#endif // __SANITIZE_ADDRESS__
+#endif // __clang__
+
+#ifdef ROCKSDB_VALGRIND_RUN
+#define MUST_FREE_HEAP_ALLOCATIONS 1
+#endif // ROCKSDB_VALGRIND_RUN
+
+// Coding guidelines say to avoid static objects with non-trivial destructors,
+// because it's easy to cause trouble (UB) in static destruction. This
+// macro makes it easier to define static objects that are normally never
+// destructed, except are destructed when running under ASAN. This should
+// avoid unexpected, unnecessary destruction behavior in production.
+// Note that constructor arguments can be provided as in
+// STATIC_AVOID_DESTRUCTION(Foo, foo)(arg1, arg2);
+#ifdef MUST_FREE_HEAP_ALLOCATIONS
+#define STATIC_AVOID_DESTRUCTION(Type, name) static Type name
+constexpr bool kMustFreeHeapAllocations = true;
+#else
+#define STATIC_AVOID_DESTRUCTION(Type, name) static Type& name = *new Type
+constexpr bool kMustFreeHeapAllocations = false;
+#endif
+
+// TSAN (Thread sanitizer)
+
+// For simplicity, standardize on the GCC define
+#if defined(__clang__)
+#if defined(__has_feature) && __has_feature(thread_sanitizer)
+#define __SANITIZE_THREAD__ 1
+#endif // __has_feature(thread_sanitizer)
+#endif // __clang__
+
+#ifdef __SANITIZE_THREAD__
+#define TSAN_SUPPRESSION __attribute__((no_sanitize("thread")))
+#else
+#define TSAN_SUPPRESSION
+#endif // TSAN_SUPPRESSION
diff --git a/src/rocksdb/port/likely.h b/src/rocksdb/port/likely.h
new file mode 100644
index 000000000..0bd90d701
--- /dev/null
+++ b/src/rocksdb/port/likely.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIKELY(x) (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
diff --git a/src/rocksdb/port/malloc.h b/src/rocksdb/port/malloc.h
new file mode 100644
index 000000000..f973263e2
--- /dev/null
+++ b/src/rocksdb/port/malloc.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#ifdef OS_FREEBSD
+#include <malloc_np.h>
+#else
+#include <malloc.h>
+#endif // OS_FREEBSD
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
diff --git a/src/rocksdb/port/port.h b/src/rocksdb/port/port.h
new file mode 100644
index 000000000..13aa56d47
--- /dev/null
+++ b/src/rocksdb/port/port.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+
+// Include the appropriate platform specific file below. If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#include "port/port_posix.h"
+#elif defined(OS_WIN)
+#include "port/win/port_win.h"
+#endif
diff --git a/src/rocksdb/port/port_dirent.h b/src/rocksdb/port/port_dirent.h
new file mode 100644
index 000000000..2b23e2f07
--- /dev/null
+++ b/src/rocksdb/port/port_dirent.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#pragma once
+
+#ifdef ROCKSDB_PLATFORM_POSIX
+#include <dirent.h>
+#include <sys/types.h>
+#elif defined(OS_WIN)
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+struct dirent {
+ char d_name[_MAX_PATH]; /* filename */
+};
+
+struct DIR;
+
+DIR* opendir(const char* name);
+
+dirent* readdir(DIR* dirp);
+
+int closedir(DIR* dirp);
+
+} // namespace port
+
+using port::closedir;
+using port::DIR;
+using port::dirent;
+using port::opendir;
+using port::readdir;
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // OS_WIN
diff --git a/src/rocksdb/port/port_example.h b/src/rocksdb/port/port_example.h
new file mode 100644
index 000000000..794149a69
--- /dev/null
+++ b/src/rocksdb/port/port_example.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This file contains the specification, but not the implementations,
+// of the types/operations/etc. that should be defined by a platform
+// specific port_<platform>.h file. Use this file as a reference for
+// how to port this package to a new platform.
+
+#pragma once
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// TODO(jorlow): Many of these belong more in the environment class rather than
+// here. We should try moving them and see if it affects perf.
+
+// The following boolean constant must be true on a little-endian machine
+// and false otherwise.
+static const bool kLittleEndian = true /* or some other expression */;
+
+// ------------------ Threading -------------------
+
+// A Mutex represents an exclusive lock.
+class Mutex {
+ public:
+ Mutex();
+ ~Mutex();
+
+ // Lock the mutex. Waits until other lockers have exited.
+ // Will deadlock if the mutex is already locked by this thread.
+ void Lock();
+
+ // Unlock the mutex.
+ // REQUIRES: This mutex was locked by this thread.
+ void Unlock();
+
+ // Optionally crash if this thread does not hold this mutex.
+ // The implementation must be fast, especially if NDEBUG is
+ // defined. The implementation is allowed to skip all checks.
+ void AssertHeld();
+};
+
+class CondVar {
+ public:
+ explicit CondVar(Mutex* mu);
+ ~CondVar();
+
+ // Atomically release *mu and block on this condition variable until
+ // either a call to SignalAll(), or a call to Signal() that picks
+ // this thread to wakeup.
+ // REQUIRES: this thread holds *mu
+ void Wait();
+
+ // If there are some threads waiting, wake up at least one of them.
+ void Signal();
+
+ // Wake up all waiting threads.
+ void SignallAll();
+};
+
+// Thread-safe initialization.
+// Used as follows:
+// static port::OnceType init_control = LEVELDB_ONCE_INIT;
+// static void Initializer() { ... do something ...; }
+// ...
+// port::InitOnce(&init_control, &Initializer);
+using OnceType = intptr_t;
+#define LEVELDB_ONCE_INIT 0
+extern void InitOnce(port::OnceType*, void (*initializer)());
+
+// ------------------ Compression -------------------
+
+// Store the snappy compression of "input[0,input_length-1]" in *output.
+// Returns false if snappy is not supported by this port.
+extern bool Snappy_Compress(const char* input, size_t input_length,
+ std::string* output);
+
+// If input[0,input_length-1] looks like a valid snappy compressed
+// buffer, store the size of the uncompressed data in *result and
+// return true. Else return false.
+extern bool Snappy_GetUncompressedLength(const char* input, size_t length,
+ size_t* result);
+
+// Attempt to snappy uncompress input[0,input_length-1] into *output.
+// Returns true if successful, false if the input is invalid lightweight
+// compressed data.
+//
+// REQUIRES: at least the first "n" bytes of output[] must be writable
+// where "n" is the result of a successful call to
+// Snappy_GetUncompressedLength.
+extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
+ char* output);
+
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/port_posix.cc b/src/rocksdb/port/port_posix.cc
new file mode 100644
index 000000000..3872293b8
--- /dev/null
+++ b/src/rocksdb/port/port_posix.cc
@@ -0,0 +1,300 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if !defined(OS_WIN)
+
+#include "port/port_posix.h"
+
+#include <assert.h>
+#if defined(__i386__) || defined(__x86_64__)
+#include <cpuid.h>
+#endif
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// We want to give users opportunity to default all the mutexes to adaptive if
+// not specified otherwise. This enables a quick way to conduct various
+// performance related experiements.
+//
+// NB! Support for adaptive mutexes is turned on by definining
+// ROCKSDB_PTHREAD_ADAPTIVE_MUTEX during the compilation. If you use RocksDB
+// build environment then this happens automatically; otherwise it's up to the
+// consumer to define the identifier.
+#ifdef ROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX
+extern const bool kDefaultToAdaptiveMutex = true;
+#else
+extern const bool kDefaultToAdaptiveMutex = false;
+#endif
+
+namespace port {
+
+static int PthreadCall(const char* label, int result) {
+ if (result != 0 && result != ETIMEDOUT && result != EBUSY) {
+ fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str());
+ abort();
+ }
+ return result;
+}
+
+Mutex::Mutex(bool adaptive) {
+ (void)adaptive;
+#ifdef ROCKSDB_PTHREAD_ADAPTIVE_MUTEX
+ if (!adaptive) {
+ PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr));
+ } else {
+ pthread_mutexattr_t mutex_attr;
+ PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr));
+ PthreadCall("set mutex attr", pthread_mutexattr_settype(
+ &mutex_attr, PTHREAD_MUTEX_ADAPTIVE_NP));
+ PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr));
+ PthreadCall("destroy mutex attr", pthread_mutexattr_destroy(&mutex_attr));
+ }
+#else
+ PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr));
+#endif // ROCKSDB_PTHREAD_ADAPTIVE_MUTEX
+}
+
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+
+void Mutex::Lock() {
+ PthreadCall("lock", pthread_mutex_lock(&mu_));
+#ifndef NDEBUG
+ locked_ = true;
+#endif
+}
+
+void Mutex::Unlock() {
+#ifndef NDEBUG
+ locked_ = false;
+#endif
+ PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+bool Mutex::TryLock() {
+ bool ret = PthreadCall("trylock", pthread_mutex_trylock(&mu_)) == 0;
+#ifndef NDEBUG
+ if (ret) {
+ locked_ = true;
+ }
+#endif
+ return ret;
+}
+
+void Mutex::AssertHeld() {
+#ifndef NDEBUG
+ assert(locked_);
+#endif
+}
+
+CondVar::CondVar(Mutex* mu) : mu_(mu) {
+ PthreadCall("init cv", pthread_cond_init(&cv_, nullptr));
+}
+
+CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }
+
+void CondVar::Wait() {
+#ifndef NDEBUG
+ mu_->locked_ = false;
+#endif
+ PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+#ifndef NDEBUG
+ mu_->locked_ = true;
+#endif
+}
+
+bool CondVar::TimedWait(uint64_t abs_time_us) {
+ struct timespec ts;
+ ts.tv_sec = static_cast<time_t>(abs_time_us / 1000000);
+ ts.tv_nsec = static_cast<suseconds_t>((abs_time_us % 1000000) * 1000);
+
+#ifndef NDEBUG
+ mu_->locked_ = false;
+#endif
+ int err = pthread_cond_timedwait(&cv_, &mu_->mu_, &ts);
+#ifndef NDEBUG
+ mu_->locked_ = true;
+#endif
+ if (err == ETIMEDOUT) {
+ return true;
+ }
+ if (err != 0) {
+ PthreadCall("timedwait", err);
+ }
+ return false;
+}
+
+void CondVar::Signal() { PthreadCall("signal", pthread_cond_signal(&cv_)); }
+
+void CondVar::SignalAll() {
+ PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+RWMutex::RWMutex() {
+ PthreadCall("init mutex", pthread_rwlock_init(&mu_, nullptr));
+}
+
+RWMutex::~RWMutex() {
+ PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_));
+}
+
+void RWMutex::ReadLock() {
+ PthreadCall("read lock", pthread_rwlock_rdlock(&mu_));
+}
+
+void RWMutex::WriteLock() {
+ PthreadCall("write lock", pthread_rwlock_wrlock(&mu_));
+}
+
+void RWMutex::ReadUnlock() {
+ PthreadCall("read unlock", pthread_rwlock_unlock(&mu_));
+}
+
+void RWMutex::WriteUnlock() {
+ PthreadCall("write unlock", pthread_rwlock_unlock(&mu_));
+}
+
+int PhysicalCoreID() {
+#if defined(ROCKSDB_SCHED_GETCPU_PRESENT) && defined(__x86_64__) && \
+ (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 22))
+ // sched_getcpu uses VDSO getcpu() syscall since 2.22. I believe Linux offers
+ // VDSO support only on x86_64. This is the fastest/preferred method if
+ // available.
+ int cpuno = sched_getcpu();
+ if (cpuno < 0) {
+ return -1;
+ }
+ return cpuno;
+#elif defined(__x86_64__) || defined(__i386__)
+ // clang/gcc both provide cpuid.h, which defines __get_cpuid(), for x86_64 and
+ // i386.
+ unsigned eax, ebx = 0, ecx, edx;
+ if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+ return -1;
+ }
+ return ebx >> 24;
+#else
+ // give up, the caller can generate a random number or something.
+ return -1;
+#endif
+}
+
+void InitOnce(OnceType* once, void (*initializer)()) {
+ PthreadCall("once", pthread_once(once, initializer));
+}
+
+void Crash(const std::string& srcfile, int srcline) {
+ fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
+ fflush(stdout);
+ kill(getpid(), SIGTERM);
+}
+
+int GetMaxOpenFiles() {
+#if defined(RLIMIT_NOFILE)
+ struct rlimit no_files_limit;
+ if (getrlimit(RLIMIT_NOFILE, &no_files_limit) != 0) {
+ return -1;
+ }
+ // protect against overflow
+ if (static_cast<uintmax_t>(no_files_limit.rlim_cur) >=
+ static_cast<uintmax_t>(std::numeric_limits<int>::max())) {
+ return std::numeric_limits<int>::max();
+ }
+ return static_cast<int>(no_files_limit.rlim_cur);
+#endif
+ return -1;
+}
+
+void* cacheline_aligned_alloc(size_t size) {
+#if __GNUC__ < 5 && defined(__SANITIZE_ADDRESS__)
+ return malloc(size);
+#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__))
+ void* m;
+ errno = posix_memalign(&m, CACHE_LINE_SIZE, size);
+ return errno ? nullptr : m;
+#else
+ return malloc(size);
+#endif
+}
+
+void cacheline_aligned_free(void* memblock) { free(memblock); }
+
+static size_t GetPageSize() {
+#if defined(OS_LINUX) || defined(_SC_PAGESIZE)
+ long v = sysconf(_SC_PAGESIZE);
+ if (v >= 1024) {
+ return static_cast<size_t>(v);
+ }
+#endif
+ // Default assume 4KB
+ return 4U * 1024U;
+}
+
+const size_t kPageSize = GetPageSize();
+
+void SetCpuPriority(ThreadId id, CpuPriority priority) {
+#ifdef OS_LINUX
+ sched_param param;
+ param.sched_priority = 0;
+ switch (priority) {
+ case CpuPriority::kHigh:
+ sched_setscheduler(id, SCHED_OTHER, &param);
+ setpriority(PRIO_PROCESS, id, -20);
+ break;
+ case CpuPriority::kNormal:
+ sched_setscheduler(id, SCHED_OTHER, &param);
+ setpriority(PRIO_PROCESS, id, 0);
+ break;
+ case CpuPriority::kLow:
+ sched_setscheduler(id, SCHED_OTHER, &param);
+ setpriority(PRIO_PROCESS, id, 19);
+ break;
+ case CpuPriority::kIdle:
+ sched_setscheduler(id, SCHED_IDLE, &param);
+ break;
+ default:
+ assert(false);
+ }
+#else
+ (void)id;
+ (void)priority;
+#endif
+}
+
+int64_t GetProcessID() { return getpid(); }
+
+bool GenerateRfcUuid(std::string* output) {
+ output->clear();
+ std::ifstream f("/proc/sys/kernel/random/uuid");
+ std::getline(f, /*&*/ *output);
+ if (output->size() == 36) {
+ return true;
+ } else {
+ output->clear();
+ return false;
+ }
+}
+
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/port_posix.h b/src/rocksdb/port/port_posix.h
new file mode 100644
index 000000000..ec6aa281d
--- /dev/null
+++ b/src/rocksdb/port/port_posix.h
@@ -0,0 +1,241 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#pragma once
+
+#include <thread>
+
+#include "rocksdb/options.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+// size_t printf formatting named in the manner of C99 standard formatting
+// strings such as PRIu64
+// in fact, we could use that one
+#define ROCKSDB_PRIszt "zu"
+
+#define __declspec(S)
+
+#undef PLATFORM_IS_LITTLE_ENDIAN
+#if defined(OS_MACOSX)
+#include <machine/endian.h>
+#if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER)
+#define PLATFORM_IS_LITTLE_ENDIAN \
+ (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN)
+#endif
+#elif defined(OS_SOLARIS)
+#include <sys/isa_defs.h>
+#ifdef _LITTLE_ENDIAN
+#define PLATFORM_IS_LITTLE_ENDIAN true
+#else
+#define PLATFORM_IS_LITTLE_ENDIAN false
+#endif
+#include <alloca.h>
+#elif defined(OS_AIX)
+#include <arpa/nameser_compat.h>
+#include <sys/types.h>
+#define PLATFORM_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN)
+#include <alloca.h>
+#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || \
+ defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID)
+#include <sys/endian.h>
+#include <sys/types.h>
+#define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN)
+#else
+#include <endian.h>
+#endif
+#include <pthread.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <limits>
+#include <string>
+
+#ifndef PLATFORM_IS_LITTLE_ENDIAN
+#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN)
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) || \
+ defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) || \
+ defined(OS_ANDROID) || defined(CYGWIN) || defined(OS_AIX)
+// Use fread/fwrite/fflush on platforms without _unlocked variants
+#define fread_unlocked fread
+#define fwrite_unlocked fwrite
+#define fflush_unlocked fflush
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || \
+ defined(OS_DRAGONFLYBSD)
+// Use fsync() on platforms without fdatasync()
+#define fdatasync fsync
+#endif
+
+#if defined(OS_ANDROID) && __ANDROID_API__ < 9
+// fdatasync() was only introduced in API level 9 on Android. Use fsync()
+// when targeting older platforms.
+#define fdatasync fsync
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const bool kDefaultToAdaptiveMutex;
+
+namespace port {
+constexpr bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN;
+#undef PLATFORM_IS_LITTLE_ENDIAN
+
+class CondVar;
+
+class Mutex {
+ public:
+ static const char* kName() { return "pthread_mutex_t"; }
+
+ explicit Mutex(bool adaptive = kDefaultToAdaptiveMutex);
+ // No copying
+ Mutex(const Mutex&) = delete;
+ void operator=(const Mutex&) = delete;
+
+ ~Mutex();
+
+ void Lock();
+ void Unlock();
+
+ bool TryLock();
+
+ // this will assert if the mutex is not locked
+ // it does NOT verify that mutex is held by a calling thread
+ void AssertHeld();
+
+ // Also implement std Lockable
+ inline void lock() { Lock(); }
+ inline void unlock() { Unlock(); }
+ inline bool try_lock() { return TryLock(); }
+
+ private:
+ friend class CondVar;
+ pthread_mutex_t mu_;
+#ifndef NDEBUG
+ bool locked_ = false;
+#endif
+};
+
+class RWMutex {
+ public:
+ RWMutex();
+ // No copying allowed
+ RWMutex(const RWMutex&) = delete;
+ void operator=(const RWMutex&) = delete;
+
+ ~RWMutex();
+
+ void ReadLock();
+ void WriteLock();
+ void ReadUnlock();
+ void WriteUnlock();
+ void AssertHeld() {}
+
+ private:
+ pthread_rwlock_t mu_; // the underlying platform mutex
+};
+
+class CondVar {
+ public:
+ explicit CondVar(Mutex* mu);
+ ~CondVar();
+ void Wait();
+ // Timed condition wait. Returns true if timeout occurred.
+ bool TimedWait(uint64_t abs_time_us);
+ void Signal();
+ void SignalAll();
+
+ private:
+ pthread_cond_t cv_;
+ Mutex* mu_;
+};
+
+using Thread = std::thread;
+
+static inline void AsmVolatilePause() {
+#if defined(__i386__) || defined(__x86_64__)
+ asm volatile("pause");
+#elif defined(__aarch64__)
+ asm volatile("isb");
+#elif defined(__powerpc64__)
+ asm volatile("or 27,27,27");
+#endif
+ // it's okay for other platforms to be no-ops
+}
+
+// Returns -1 if not available on this platform
+extern int PhysicalCoreID();
+
+using OnceType = pthread_once_t;
+#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT
+extern void InitOnce(OnceType* once, void (*initializer)());
+
+#ifndef CACHE_LINE_SIZE
+// To test behavior with non-native cache line size, e.g. for
+// Bloom filters, set TEST_CACHE_LINE_SIZE to the desired test size.
+// This disables ALIGN_AS to keep it from failing compilation.
+#ifdef TEST_CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE TEST_CACHE_LINE_SIZE
+#define ALIGN_AS(n) /*empty*/
+#else
+#if defined(__s390__)
+#if defined(__GNUC__) && __GNUC__ < 7
+#define CACHE_LINE_SIZE 64U
+#else
+#define CACHE_LINE_SIZE 256U
+#endif
+#elif defined(__powerpc__) || defined(__aarch64__)
+#define CACHE_LINE_SIZE 128U
+#else
+#define CACHE_LINE_SIZE 64U
+#endif
+#define ALIGN_AS(n) alignas(n)
+#endif
+#endif
+
+static_assert((CACHE_LINE_SIZE & (CACHE_LINE_SIZE - 1)) == 0,
+ "Cache line size must be a power of 2 number of bytes");
+
+extern void* cacheline_aligned_alloc(size_t size);
+
+extern void cacheline_aligned_free(void* memblock);
+
+#if defined(__aarch64__)
+// __builtin_prefetch(..., 1) turns into a prefetch into prfm pldl3keep. On
+// arm64 we want this as close to the core as possible to turn it into a
+// L1 prefetech unless locality == 0 in which case it will be turned into a
+// non-temporal prefetch
+#define PREFETCH(addr, rw, locality) \
+ __builtin_prefetch(addr, rw, locality >= 1 ? 3 : locality)
+#else
+#define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
+#endif
+
+extern void Crash(const std::string& srcfile, int srcline);
+
+extern int GetMaxOpenFiles();
+
+extern const size_t kPageSize;
+
+using ThreadId = pid_t;
+
+extern void SetCpuPriority(ThreadId id, CpuPriority priority);
+
+int64_t GetProcessID();
+
+// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns
+// true on success or false on failure.
+bool GenerateRfcUuid(std::string* output);
+
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/stack_trace.cc b/src/rocksdb/port/stack_trace.cc
new file mode 100644
index 000000000..ef7144947
--- /dev/null
+++ b/src/rocksdb/port/stack_trace.cc
@@ -0,0 +1,202 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "port/stack_trace.h"
+
+#if defined(ROCKSDB_LITE) || \
+ !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \
+ defined(OS_SOLARIS) || defined(OS_WIN)
+
+// noop
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+void InstallStackTraceHandler() {}
+void PrintStack(int /*first_frames_to_skip*/) {}
+void PrintAndFreeStack(void* /*callstack*/, int /*num_frames*/) {}
+void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) {
+ return nullptr;
+}
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+#else
+
+#include <cxxabi.h>
+#include <execinfo.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#if defined(OS_FREEBSD)
+#include <sys/sysctl.h>
+#endif
+#ifdef OS_LINUX
+#include <sys/prctl.h>
+#endif
+
+#include "port/lang.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+namespace {
+
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD)
+const char* GetExecutableName() {
+ static char name[1024];
+
+#if !defined(OS_FREEBSD)
+ char link[1024];
+ snprintf(link, sizeof(link), "/proc/%d/exe", getpid());
+ auto read = readlink(link, name, sizeof(name) - 1);
+ if (-1 == read) {
+ return nullptr;
+ } else {
+ name[read] = 0;
+ return name;
+ }
+#else
+ int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
+ size_t namesz = sizeof(name);
+
+ auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0);
+ if (-1 == ret) {
+ return nullptr;
+ } else {
+ return name;
+ }
+#endif
+}
+
+void PrintStackTraceLine(const char* symbol, void* frame) {
+ static const char* executable = GetExecutableName();
+ if (symbol) {
+ fprintf(stderr, "%s ", symbol);
+ }
+ if (executable) {
+ // out source to addr2line, for the address translation
+ const int kLineMax = 256;
+ char cmd[kLineMax];
+ snprintf(cmd, kLineMax, "addr2line %p -e %s -f -C 2>&1", frame, executable);
+ auto f = popen(cmd, "r");
+ if (f) {
+ char line[kLineMax];
+ while (fgets(line, sizeof(line), f)) {
+ line[strlen(line) - 1] = 0; // remove newline
+ fprintf(stderr, "%s\t", line);
+ }
+ pclose(f);
+ }
+ } else {
+ fprintf(stderr, " %p", frame);
+ }
+
+ fprintf(stderr, "\n");
+}
+#elif defined(OS_MACOSX)
+
+void PrintStackTraceLine(const char* symbol, void* frame) {
+ static int pid = getpid();
+ // out source to atos, for the address translation
+ const int kLineMax = 256;
+ char cmd[kLineMax];
+ snprintf(cmd, kLineMax, "xcrun atos %p -p %d 2>&1", frame, pid);
+ auto f = popen(cmd, "r");
+ if (f) {
+ char line[kLineMax];
+ while (fgets(line, sizeof(line), f)) {
+ line[strlen(line) - 1] = 0; // remove newline
+ fprintf(stderr, "%s\t", line);
+ }
+ pclose(f);
+ } else if (symbol) {
+ fprintf(stderr, "%s ", symbol);
+ }
+
+ fprintf(stderr, "\n");
+}
+
+#endif
+
+} // namespace
+
+void PrintStack(void* frames[], int num_frames) {
+ auto symbols = backtrace_symbols(frames, num_frames);
+
+ for (int i = 0; i < num_frames; ++i) {
+ fprintf(stderr, "#%-2d ", i);
+ PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]);
+ }
+ free(symbols);
+}
+
+void PrintStack(int first_frames_to_skip) {
+ const int kMaxFrames = 100;
+ void* frames[kMaxFrames];
+
+ auto num_frames = backtrace(frames, kMaxFrames);
+ PrintStack(&frames[first_frames_to_skip], num_frames - first_frames_to_skip);
+}
+
+void PrintAndFreeStack(void* callstack, int num_frames) {
+ PrintStack(static_cast<void**>(callstack), num_frames);
+ free(callstack);
+}
+
+void* SaveStack(int* num_frames, int first_frames_to_skip) {
+ const int kMaxFrames = 100;
+ void* frames[kMaxFrames];
+
+ auto count = backtrace(frames, kMaxFrames);
+ *num_frames = count - first_frames_to_skip;
+ void* callstack = malloc(sizeof(void*) * *num_frames);
+ memcpy(callstack, &frames[first_frames_to_skip], sizeof(void*) * *num_frames);
+ return callstack;
+}
+
+static void StackTraceHandler(int sig) {
+ // reset to default handler
+ signal(sig, SIG_DFL);
+ fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig));
+ // skip the top three signal handler related frames
+ PrintStack(3);
+
+ // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of
+ // a signal" have failed, so just warn the user about them.
+#ifdef __SANITIZE_THREAD__
+ fprintf(stderr,
+ "==> NOTE: any above warnings about \"signal-unsafe call\" are\n"
+ "==> ignorable, as they are expected when generating a stack\n"
+ "==> trace because of a signal under TSAN. Consider why the\n"
+ "==> signal was generated to begin with, and the stack trace\n"
+ "==> in the TSAN warning can be useful for that. (The stack\n"
+ "==> trace printed by the signal handler is likely obscured\n"
+ "==> by TSAN output.)\n");
+#endif
+
+ // re-signal to default handler (so we still get core dump if needed...)
+ raise(sig);
+}
+
+void InstallStackTraceHandler() {
+ // just use the plain old signal as it's simple and sufficient
+ // for this use case
+ signal(SIGILL, StackTraceHandler);
+ signal(SIGSEGV, StackTraceHandler);
+ signal(SIGBUS, StackTraceHandler);
+ signal(SIGABRT, StackTraceHandler);
+ // Allow ouside debugger to attach, even with Yama security restrictions
+#ifdef PR_SET_PTRACER_ANY
+ (void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
+#endif
+}
+
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/stack_trace.h b/src/rocksdb/port/stack_trace.h
new file mode 100644
index 000000000..5b3bf9320
--- /dev/null
+++ b/src/rocksdb/port/stack_trace.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// Install a signal handler to print callstack on the following signals:
+// SIGILL SIGSEGV SIGBUS SIGABRT
+// And also (Linux ony for now) overrides security settings to allow outside
+// processes to attach to this one as a debugger. ONLY USE FOR NON-SECURITY
+// CRITICAL PROCESSES such as unit tests or benchmarking tools.
+// Currently supports only some POSIX implementations. No-op otherwise.
+void InstallStackTraceHandler();
+
+// Prints stack, skips skip_first_frames frames
+void PrintStack(int first_frames_to_skip = 0);
+
+// Prints the given callstack
+void PrintAndFreeStack(void* callstack, int num_frames);
+
+// Save the current callstack
+void* SaveStack(int* num_frame, int first_frames_to_skip = 0);
+
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/sys_time.h b/src/rocksdb/port/sys_time.h
new file mode 100644
index 000000000..f2137526b
--- /dev/null
+++ b/src/rocksdb/port/sys_time.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This file is a portable substitute for sys/time.h which does not exist on
+// Windows
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#if defined(OS_WIN) && (defined(_MSC_VER) || defined(__MINGW32__))
+
+#include <time.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace port {
+
+struct TimeVal {
+ long tv_sec;
+ long tv_usec;
+};
+
+void GetTimeOfDay(TimeVal* tv, struct timezone* tz);
+
+inline struct tm* LocalTimeR(const time_t* timep, struct tm* result) {
+ errno_t ret = localtime_s(result, timep);
+ return (ret == 0) ? result : NULL;
+}
+
+} // namespace port
+
+} // namespace ROCKSDB_NAMESPACE
+
+#else
+#include <sys/time.h>
+#include <time.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace port {
+
+using TimeVal = struct timeval;
+
+inline void GetTimeOfDay(TimeVal* tv, struct timezone* tz) {
+ gettimeofday(tv, tz);
+}
+
+inline struct tm* LocalTimeR(const time_t* timep, struct tm* result) {
+ return localtime_r(timep, result);
+}
+
+} // namespace port
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/util_logger.h b/src/rocksdb/port/util_logger.h
new file mode 100644
index 000000000..ce7e3a941
--- /dev/null
+++ b/src/rocksdb/port/util_logger.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+// Include the appropriate platform specific file below. If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+
+#if defined(OS_WIN)
+#include "port/win/win_logger.h"
+#endif
diff --git a/src/rocksdb/port/win/env_default.cc b/src/rocksdb/port/win/env_default.cc
new file mode 100644
index 000000000..48853f26e
--- /dev/null
+++ b/src/rocksdb/port/win/env_default.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include <mutex>
+
+#include "port/win/env_win.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/compression_context_cache.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// We choose not to destroy the env because joining the threads from the
+// system loader
+// which destroys the statics (same as from DLLMain) creates a system loader
+// dead-lock.
+// in this manner any remaining threads are terminated OK.
+namespace {
+std::once_flag winenv_once_flag;
+Env* envptr;
+}; // namespace
+} // namespace port
+
+Env* Env::Default() {
+ ThreadLocalPtr::InitSingletons();
+ CompressionContextCache::InitSingleton();
+ INIT_SYNC_POINT_SINGLETONS();
+ std::call_once(port::winenv_once_flag,
+ []() { port::envptr = new port::WinEnv(); });
+ return port::envptr;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/env_win.cc b/src/rocksdb/port/win/env_win.cc
new file mode 100644
index 000000000..2262eb59c
--- /dev/null
+++ b/src/rocksdb/port/win/env_win.cc
@@ -0,0 +1,1437 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/env_win.h"
+
+#include <direct.h> // _rmdir, _mkdir, _getcwd
+#include <errno.h>
+#include <io.h> // _access
+#include <rpc.h> // for uuid generation
+#include <shlwapi.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <windows.h>
+#include <winioctl.h>
+
+#include <algorithm>
+#include <ctime>
+#include <thread>
+
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "port/port_dirent.h"
+#include "port/win/io_win.h"
+#include "port/win/win_logger.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "strsafe.h"
+#include "util/string_util.h"
+
+// Undefine the functions windows might use (again)...
+#undef GetCurrentTime
+#undef DeleteFile
+#undef LoadLibrary
+
+namespace ROCKSDB_NAMESPACE {
+
+ThreadStatusUpdater* CreateThreadStatusUpdater() {
+ return new ThreadStatusUpdater();
+}
+
+namespace {
+
+// Sector size used when physical sector size cannot be obtained from device.
+static const size_t kSectorSize = 512;
+
+// RAII helpers for HANDLEs
+const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); };
+using UniqueCloseHandlePtr = std::unique_ptr<void, decltype(CloseHandleFunc)>;
+
+const auto FindCloseFunc = [](HANDLE h) { ::FindClose(h); };
+using UniqueFindClosePtr = std::unique_ptr<void, decltype(FindCloseFunc)>;
+
+void WinthreadCall(const char* label, std::error_code result) {
+ if (0 != result.value()) {
+ fprintf(stderr, "Winthread %s: %s\n", label,
+ errnoStr(result.value()).c_str());
+ abort();
+ }
+}
+
+} // namespace
+
+namespace port {
+WinClock::WinClock()
+ : perf_counter_frequency_(0),
+ nano_seconds_per_period_(0),
+ GetSystemTimePreciseAsFileTime_(NULL) {
+ {
+ LARGE_INTEGER qpf;
+ BOOL ret __attribute__((__unused__));
+ ret = QueryPerformanceFrequency(&qpf);
+ assert(ret == TRUE);
+ perf_counter_frequency_ = qpf.QuadPart;
+
+ if (std::nano::den % perf_counter_frequency_ == 0) {
+ nano_seconds_per_period_ = std::nano::den / perf_counter_frequency_;
+ }
+ }
+
+ HMODULE module = GetModuleHandle("kernel32.dll");
+ if (module != NULL) {
+ GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)(
+ void*)GetProcAddress(module, "GetSystemTimePreciseAsFileTime");
+ }
+}
+
+void WinClock::SleepForMicroseconds(int micros) {
+ std::this_thread::sleep_for(std::chrono::microseconds(micros));
+}
+
+std::string WinClock::TimeToString(uint64_t secondsSince1970) {
+ std::string result;
+
+ const time_t seconds = secondsSince1970;
+ const int maxsize = 64;
+
+ struct tm t;
+ errno_t ret = localtime_s(&t, &seconds);
+
+ if (ret) {
+ result = std::to_string(seconds);
+ } else {
+ result.resize(maxsize);
+ char* p = &result[0];
+
+ int len =
+ snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900,
+ t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
+ assert(len > 0);
+
+ result.resize(len);
+ }
+
+ return result;
+}
+
+uint64_t WinClock::NowMicros() {
+ if (GetSystemTimePreciseAsFileTime_ != NULL) {
+ // all std::chrono clocks on windows proved to return
+ // values that may repeat that is not good enough for some uses.
+ const int64_t c_UnixEpochStartTicks = 116444736000000000LL;
+ const int64_t c_FtToMicroSec = 10;
+
+ // This interface needs to return system time and not
+ // just any microseconds because it is often used as an argument
+ // to TimedWait() on condition variable
+ FILETIME ftSystemTime;
+ GetSystemTimePreciseAsFileTime_(&ftSystemTime);
+
+ LARGE_INTEGER li;
+ li.LowPart = ftSystemTime.dwLowDateTime;
+ li.HighPart = ftSystemTime.dwHighDateTime;
+ // Subtract unix epoch start
+ li.QuadPart -= c_UnixEpochStartTicks;
+ // Convert to microsecs
+ li.QuadPart /= c_FtToMicroSec;
+ return li.QuadPart;
+ }
+ return std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count();
+}
+
+uint64_t WinClock::NowNanos() {
+ if (nano_seconds_per_period_ != 0) {
+ // all std::chrono clocks on windows have the same resolution that is only
+ // good enough for microseconds but not nanoseconds
+ // On Windows 8 and Windows 2012 Server
+ // GetSystemTimePreciseAsFileTime(&current_time) can be used
+ LARGE_INTEGER li;
+ QueryPerformanceCounter(&li);
+ // Convert performance counter to nanoseconds by precomputed ratio.
+ // Directly multiply nano::den with li.QuadPart causes overflow.
+ // Only do this when nano::den is divisible by perf_counter_frequency_,
+ // which most likely is the case in reality. If it's not, fall back to
+ // high_resolution_clock, which may be less precise under old compilers.
+ li.QuadPart *= nano_seconds_per_period_;
+ return li.QuadPart;
+ }
+ return std::chrono::duration_cast<std::chrono::nanoseconds>(
+ std::chrono::high_resolution_clock::now().time_since_epoch())
+ .count();
+}
+
+Status WinClock::GetCurrentTime(int64_t* unix_time) {
+ time_t time = std::time(nullptr);
+ if (time == (time_t)(-1)) {
+ return Status::NotSupported("Failed to get time");
+ }
+
+ *unix_time = time;
+ return Status::OK();
+}
+
+WinFileSystem::WinFileSystem(const std::shared_ptr<SystemClock>& clock)
+ : clock_(clock), page_size_(4 * 1024), allocation_granularity_(page_size_) {
+ SYSTEM_INFO sinfo;
+ GetSystemInfo(&sinfo);
+
+ page_size_ = sinfo.dwPageSize;
+ allocation_granularity_ = sinfo.dwAllocationGranularity;
+}
+
+const std::shared_ptr<WinFileSystem>& WinFileSystem::Default() {
+ STATIC_AVOID_DESTRUCTION(std::shared_ptr<WinFileSystem>, fs)
+ (std::make_shared<WinFileSystem>(WinClock::Default()));
+ return fs;
+}
+
+WinEnvIO::WinEnvIO(Env* hosted_env) : hosted_env_(hosted_env) {}
+
+WinEnvIO::~WinEnvIO() {}
+
+IOStatus WinFileSystem::DeleteFile(const std::string& fname,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus result;
+
+ BOOL ret = RX_DeleteFile(RX_FN(fname).c_str());
+
+ if (!ret) {
+ auto lastError = GetLastError();
+ result = IOErrorFromWindowsError("Failed to delete: " + fname, lastError);
+ }
+
+ return result;
+}
+
+IOStatus WinFileSystem::Truncate(const std::string& fname, size_t size,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+ int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size);
+ if (result != 0) {
+ s = IOError("Failed to truncate: " + fname, errno);
+ }
+ return s;
+}
+
+IOStatus WinFileSystem::NewSequentialFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSSequentialFile>* result, IODebugContext* /*dbg*/) {
+ IOStatus s;
+
+ result->reset();
+
+ // Corruption test needs to rename and delete files of these kind
+ // while they are still open with another handle. For that reason we
+ // allow share_write and delete(allows rename).
+ HANDLE hFile = INVALID_HANDLE_VALUE;
+
+ DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+ if (options.use_direct_reads && !options.use_mmap_reads) {
+ fileFlags |= FILE_FLAG_NO_BUFFERING;
+ }
+
+ {
+ IOSTATS_TIMER_GUARD(open_nanos);
+ hFile = RX_CreateFile(
+ RX_FN(fname).c_str(), GENERIC_READ,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
+ OPEN_EXISTING, // Original fopen mode is "rb"
+ fileFlags, NULL);
+ }
+
+ if (INVALID_HANDLE_VALUE == hFile) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname,
+ lastError);
+ } else {
+ result->reset(new WinSequentialFile(fname, hFile, options));
+ }
+ return s;
+}
+
+IOStatus WinFileSystem::NewRandomAccessFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+ result->reset();
+ IOStatus s;
+
+ // Open the file for read-only random access
+ // Random access is to disable read-ahead as the system reads too much data
+ DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+ if (options.use_direct_reads && !options.use_mmap_reads) {
+ fileFlags |= FILE_FLAG_NO_BUFFERING;
+ } else {
+ fileFlags |= FILE_FLAG_RANDOM_ACCESS;
+ }
+
+ /// Shared access is necessary for corruption test to pass
+ // almost all tests would work with a possible exception of fault_injection
+ HANDLE hFile = 0;
+ {
+ IOSTATS_TIMER_GUARD(open_nanos);
+ hFile =
+ RX_CreateFile(RX_FN(fname).c_str(), GENERIC_READ,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL, OPEN_EXISTING, fileFlags, NULL);
+ }
+
+ if (INVALID_HANDLE_VALUE == hFile) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError(
+ "NewRandomAccessFile failed to Create/Open: " + fname, lastError);
+ }
+
+ UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+
+ // CAUTION! This will map the entire file into the process address space.
+ // Not recommended for 32-bit platforms.
+ if (options.use_mmap_reads) {
+ uint64_t fileSize;
+
+ s = GetFileSize(fname, IOOptions(), &fileSize, dbg);
+
+ if (s.ok()) {
+ // Will not map empty files
+ if (fileSize == 0) {
+ return IOError("NewRandomAccessFile failed to map empty file: " + fname,
+ EINVAL);
+ }
+
+ HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY,
+ 0, // At its present length
+ 0,
+ NULL); // Mapping name
+
+ if (!hMap) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError(
+ "Failed to create file mapping for NewRandomAccessFile: " + fname,
+ lastError);
+ }
+
+ UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc);
+
+ const void* mapped_region =
+ MapViewOfFileEx(hMap, FILE_MAP_READ,
+ 0, // High DWORD of access start
+ 0, // Low DWORD
+ static_cast<SIZE_T>(fileSize),
+ NULL); // Let the OS choose the mapping
+
+ if (!mapped_region) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError(
+ "Failed to MapViewOfFile for NewRandomAccessFile: " + fname,
+ lastError);
+ }
+
+ result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region,
+ static_cast<size_t>(fileSize)));
+
+ mapGuard.release();
+ fileGuard.release();
+ }
+ } else {
+ result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options));
+ fileGuard.release();
+ }
+ return s;
+}
+
+IOStatus WinFileSystem::OpenWritableFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result, bool reopen) {
+ const size_t c_BufferCapacity = 64 * 1024;
+
+ EnvOptions local_options(options);
+
+ result->reset();
+ IOStatus s;
+
+ DWORD fileFlags = FILE_ATTRIBUTE_NORMAL;
+
+ if (local_options.use_direct_writes && !local_options.use_mmap_writes) {
+ fileFlags = FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+ }
+
+ // Desired access. We are want to write only here but if we want to memory
+ // map
+ // the file then there is no write only mode so we have to create it
+ // Read/Write
+ // However, MapViewOfFile specifies only Write only
+ DWORD desired_access = GENERIC_WRITE;
+ DWORD shared_mode = FILE_SHARE_READ;
+
+ if (local_options.use_mmap_writes) {
+ desired_access |= GENERIC_READ;
+ } else {
+ // Adding this solely for tests to pass (fault_injection_test,
+ // wal_manager_test).
+ shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE);
+ }
+
+ // This will always truncate the file
+ DWORD creation_disposition = CREATE_ALWAYS;
+ if (reopen) {
+ creation_disposition = OPEN_ALWAYS;
+ }
+
+ HANDLE hFile = 0;
+ {
+ IOSTATS_TIMER_GUARD(open_nanos);
+ hFile = RX_CreateFile(
+ RX_FN(fname).c_str(),
+ desired_access, // Access desired
+ shared_mode,
+ NULL, // Security attributes
+ // Posix env says (reopen) ? (O_CREATE | O_APPEND) : O_CREAT | O_TRUNC
+ creation_disposition,
+ fileFlags, // Flags
+ NULL); // Template File
+ }
+
+ if (INVALID_HANDLE_VALUE == hFile) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError(
+ "Failed to create a NewWritableFile: " + fname, lastError);
+ }
+
+ // We will start writing at the end, appending
+ if (reopen) {
+ LARGE_INTEGER zero_move;
+ zero_move.QuadPart = 0;
+ BOOL ret = SetFilePointerEx(hFile, zero_move, NULL, FILE_END);
+ if (!ret) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError(
+ "Failed to create a ReopenWritableFile move to the end: " + fname,
+ lastError);
+ }
+ }
+
+ if (options.use_mmap_writes) {
+ // We usually do not use mmmapping on SSD and thus we pass memory
+ // page_size
+ result->reset(new WinMmapFile(fname, hFile, page_size_,
+ allocation_granularity_, local_options));
+ } else {
+ // Here we want the buffer allocation to be aligned by the SSD page size
+ // and to be a multiple of it
+ result->reset(new WinWritableFile(fname, hFile, GetPageSize(),
+ c_BufferCapacity, local_options));
+ }
+ return s;
+}
+
+IOStatus WinFileSystem::NewWritableFile(const std::string& fname,
+ const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* /*dbg*/) {
+ return OpenWritableFile(fname, options, result, false);
+}
+
+IOStatus WinFileSystem::ReopenWritableFile(
+ const std::string& fname, const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) {
+ return OpenWritableFile(fname, options, result, true);
+}
+
+IOStatus WinFileSystem::NewRandomRWFile(const std::string& fname,
+ const FileOptions& options,
+ std::unique_ptr<FSRandomRWFile>* result,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+
+ // Open the file for read-only random access
+ // Random access is to disable read-ahead as the system reads too much data
+ DWORD desired_access = GENERIC_READ | GENERIC_WRITE;
+ DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+ DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist
+ DWORD file_flags = FILE_FLAG_RANDOM_ACCESS;
+
+ if (options.use_direct_reads && options.use_direct_writes) {
+ file_flags |= FILE_FLAG_NO_BUFFERING;
+ }
+
+ /// Shared access is necessary for corruption test to pass
+ // almost all tests would work with a possible exception of fault_injection
+ HANDLE hFile = 0;
+ {
+ IOSTATS_TIMER_GUARD(open_nanos);
+ hFile = RX_CreateFile(RX_FN(fname).c_str(), desired_access, shared_mode,
+ NULL, // Security attributes
+ creation_disposition, file_flags, NULL);
+ }
+
+ if (INVALID_HANDLE_VALUE == hFile) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError(
+ "NewRandomRWFile failed to Create/Open: " + fname, lastError);
+ }
+
+ UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+ result->reset(new WinRandomRWFile(fname, hFile, GetPageSize(), options));
+ fileGuard.release();
+
+ return s;
+}
+
+IOStatus WinFileSystem::NewMemoryMappedFileBuffer(
+ const std::string& fname, std::unique_ptr<MemoryMappedFileBuffer>* result) {
+ IOStatus s;
+ result->reset();
+
+ DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+ HANDLE hFile = INVALID_HANDLE_VALUE;
+ {
+ IOSTATS_TIMER_GUARD(open_nanos);
+ hFile = RX_CreateFile(
+ RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
+ OPEN_EXISTING, // Open only if it exists
+ fileFlags, NULL);
+ }
+
+ if (INVALID_HANDLE_VALUE == hFile) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Failed to open NewMemoryMappedFileBuffer: " + fname, lastError);
+ return s;
+ }
+ UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+
+ uint64_t fileSize = 0;
+ s = GetFileSize(fname, IOOptions(), &fileSize, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ // Will not map empty files
+ if (fileSize == 0) {
+ return IOStatus::NotSupported(
+ "NewMemoryMappedFileBuffer can not map zero length files: " + fname);
+ }
+
+ // size_t is 32-bit with 32-bit builds
+ if (fileSize > std::numeric_limits<size_t>::max()) {
+ return IOStatus::NotSupported(
+ "The specified file size does not fit into 32-bit memory addressing: " +
+ fname);
+ }
+
+ HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE,
+ 0, // Whole file at its present length
+ 0,
+ NULL); // Mapping name
+
+ if (!hMap) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError(
+ "Failed to create file mapping for: " + fname, lastError);
+ }
+ UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc);
+
+ void* base = MapViewOfFileEx(hMap, FILE_MAP_WRITE,
+ 0, // High DWORD of access start
+ 0, // Low DWORD
+ static_cast<SIZE_T>(fileSize),
+ NULL); // Let the OS choose the mapping
+
+ if (!base) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError(
+ "Failed to MapViewOfFile for NewMemoryMappedFileBuffer: " + fname,
+ lastError);
+ }
+
+ result->reset(new WinMemoryMappedBuffer(hFile, hMap, base,
+ static_cast<size_t>(fileSize)));
+
+ mapGuard.release();
+ fileGuard.release();
+
+ return s;
+}
+
+IOStatus WinFileSystem::NewDirectory(const std::string& name,
+ const IOOptions& /*options*/,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+ // Must be nullptr on failure
+ result->reset();
+
+ if (!DirExists(name)) {
+ s = IOErrorFromWindowsError("open folder: " + name, ERROR_DIRECTORY);
+ return s;
+ }
+
+ HANDLE handle = INVALID_HANDLE_VALUE;
+ // 0 - for access means read metadata
+ {
+ IOSTATS_TIMER_GUARD(open_nanos);
+ handle = RX_CreateFile(
+ RX_FN(name).c_str(), 0,
+ FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+ OPEN_EXISTING,
+ FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible
+ NULL);
+ }
+
+ if (INVALID_HANDLE_VALUE == handle) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("open folder: " + name, lastError);
+ return s;
+ }
+
+ result->reset(new WinDirectory(name, handle));
+
+ return s;
+}
+
+IOStatus WinFileSystem::FileExists(const std::string& fname,
+ const IOOptions& /*opts*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+ // TODO: This does not follow symbolic links at this point
+ // which is consistent with _access() impl on windows
+ // but can be added
+ WIN32_FILE_ATTRIBUTE_DATA attrs;
+ if (FALSE == RX_GetFileAttributesEx(RX_FN(fname).c_str(),
+ GetFileExInfoStandard, &attrs)) {
+ auto lastError = GetLastError();
+ switch (lastError) {
+ case ERROR_ACCESS_DENIED:
+ case ERROR_NOT_FOUND:
+ case ERROR_FILE_NOT_FOUND:
+ case ERROR_PATH_NOT_FOUND:
+ s = IOStatus::NotFound();
+ break;
+ default:
+ s = IOErrorFromWindowsError("Unexpected error for: " + fname,
+ lastError);
+ break;
+ }
+ }
+ return s;
+}
+
+IOStatus WinFileSystem::GetChildren(const std::string& dir,
+ const IOOptions& /*opts*/,
+ std::vector<std::string>* result,
+ IODebugContext* /*dbg*/) {
+ IOStatus status;
+ result->clear();
+
+ RX_WIN32_FIND_DATA data;
+ memset(&data, 0, sizeof(data));
+ std::string pattern(dir);
+ pattern.append("\\").append("*");
+
+ HANDLE handle =
+ RX_FindFirstFileEx(RX_FN(pattern).c_str(),
+ // Do not want alternative name
+ FindExInfoBasic, &data, FindExSearchNameMatch,
+ NULL, // lpSearchFilter
+ 0);
+
+ if (handle == INVALID_HANDLE_VALUE) {
+ auto lastError = GetLastError();
+ switch (lastError) {
+ case ERROR_NOT_FOUND:
+ case ERROR_ACCESS_DENIED:
+ case ERROR_FILE_NOT_FOUND:
+ case ERROR_PATH_NOT_FOUND:
+ status = IOStatus::NotFound();
+ break;
+ default:
+ status = IOErrorFromWindowsError("Failed to GetChhildren for: " + dir,
+ lastError);
+ }
+ return status;
+ }
+
+ UniqueFindClosePtr fc(handle, FindCloseFunc);
+
+ // For safety
+ data.cFileName[MAX_PATH - 1] = 0;
+
+ while (true) {
+ // filter out '.' and '..' directory entries
+ // which appear only on some platforms
+ const bool ignore =
+ ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) &&
+ (RX_FNCMP(data.cFileName, ".") == 0 ||
+ RX_FNCMP(data.cFileName, "..") == 0);
+ if (!ignore) {
+ auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName));
+ result->push_back(FN_TO_RX(x));
+ }
+
+ BOOL ret = -RX_FindNextFile(handle, &data);
+ // If the function fails the return value is zero
+ // and non-zero otherwise. Not TRUE or FALSE.
+ if (ret == FALSE) {
+ // Posix does not care why we stopped
+ break;
+ }
+ data.cFileName[MAX_PATH - 1] = 0;
+ }
+ return status;
+}
+
+IOStatus WinFileSystem::CreateDir(const std::string& name,
+ const IOOptions& /*opts*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus result;
+ BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL);
+ if (!ret) {
+ auto lastError = GetLastError();
+ result = IOErrorFromWindowsError("Failed to create a directory: " + name,
+ lastError);
+ }
+
+ return result;
+}
+
+IOStatus WinFileSystem::CreateDirIfMissing(const std::string& name,
+ const IOOptions& /*opts*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus result;
+
+ if (DirExists(name)) {
+ return result;
+ }
+
+ BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL);
+ if (!ret) {
+ auto lastError = GetLastError();
+ if (lastError != ERROR_ALREADY_EXISTS) {
+ result = IOErrorFromWindowsError("Failed to create a directory: " + name,
+ lastError);
+ } else {
+ result = IOStatus::IOError(name + ": exists but is not a directory");
+ }
+ }
+ return result;
+}
+
+IOStatus WinFileSystem::DeleteDir(const std::string& name,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus result;
+ BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str());
+ if (!ret) {
+ auto lastError = GetLastError();
+ result =
+ IOErrorFromWindowsError("Failed to remove dir: " + name, lastError);
+ }
+ return result;
+}
+
+IOStatus WinFileSystem::GetFileSize(const std::string& fname,
+ const IOOptions& /*opts*/, uint64_t* size,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+
+ WIN32_FILE_ATTRIBUTE_DATA attrs;
+ if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard,
+ &attrs)) {
+ ULARGE_INTEGER file_size;
+ file_size.HighPart = attrs.nFileSizeHigh;
+ file_size.LowPart = attrs.nFileSizeLow;
+ *size = file_size.QuadPart;
+ } else {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("Can not get size for: " + fname, lastError);
+ }
+ return s;
+}
+
+uint64_t WinFileSystem::FileTimeToUnixTime(const FILETIME& ftTime) {
+ const uint64_t c_FileTimePerSecond = 10000000U;
+ // UNIX epoch starts on 1970-01-01T00:00:00Z
+ // Windows FILETIME starts on 1601-01-01T00:00:00Z
+ // Therefore, we need to subtract the below number of seconds from
+ // the seconds that we obtain from FILETIME with an obvious loss of
+ // precision
+ const uint64_t c_SecondBeforeUnixEpoch = 11644473600U;
+
+ ULARGE_INTEGER li;
+ li.HighPart = ftTime.dwHighDateTime;
+ li.LowPart = ftTime.dwLowDateTime;
+
+ uint64_t result =
+ (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch;
+ return result;
+}
+
+IOStatus WinFileSystem::GetFileModificationTime(const std::string& fname,
+ const IOOptions& /*opts*/,
+ uint64_t* file_mtime,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+
+ WIN32_FILE_ATTRIBUTE_DATA attrs;
+ if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard,
+ &attrs)) {
+ *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime);
+ } else {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Can not get file modification time for: " + fname, lastError);
+ *file_mtime = 0;
+ }
+
+ return s;
+}
+
+IOStatus WinFileSystem::RenameFile(const std::string& src,
+ const std::string& target,
+ const IOOptions& /*opts*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus result;
+
+ // rename() is not capable of replacing the existing file as on Linux
+ // so use OS API directly
+ if (!RX_MoveFileEx(RX_FN(src).c_str(), RX_FN(target).c_str(),
+ MOVEFILE_REPLACE_EXISTING)) {
+ DWORD lastError = GetLastError();
+
+ std::string text("Failed to rename: ");
+ text.append(src).append(" to: ").append(target);
+
+ result = IOErrorFromWindowsError(text, lastError);
+ }
+
+ return result;
+}
+
+IOStatus WinFileSystem::LinkFile(const std::string& src,
+ const std::string& target,
+ const IOOptions& /*opts*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus result;
+
+ if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) {
+ DWORD lastError = GetLastError();
+ if (lastError == ERROR_NOT_SAME_DEVICE) {
+ return IOStatus::NotSupported("No cross FS links allowed");
+ }
+
+ std::string text("Failed to link: ");
+ text.append(src).append(" to: ").append(target);
+
+ result = IOErrorFromWindowsError(text, lastError);
+ }
+
+ return result;
+}
+
+IOStatus WinFileSystem::NumFileLinks(const std::string& fname,
+ const IOOptions& /*opts*/, uint64_t* count,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+ HANDLE handle =
+ RX_CreateFile(RX_FN(fname).c_str(), 0,
+ FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
+ NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+
+ if (INVALID_HANDLE_VALUE == handle) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("NumFileLinks: " + fname, lastError);
+ return s;
+ }
+ UniqueCloseHandlePtr handle_guard(handle, CloseHandleFunc);
+ FILE_STANDARD_INFO standard_info;
+ if (0 != GetFileInformationByHandleEx(handle, FileStandardInfo,
+ &standard_info,
+ sizeof(standard_info))) {
+ *count = standard_info.NumberOfLinks;
+ } else {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("GetFileInformationByHandleEx: " + fname,
+ lastError);
+ }
+ return s;
+}
+
+IOStatus WinFileSystem::AreFilesSame(const std::string& first,
+ const std::string& second,
+ const IOOptions& /*opts*/, bool* res,
+ IODebugContext* /*dbg*/) {
+// For MinGW builds
+#if (_WIN32_WINNT == _WIN32_WINNT_VISTA)
+ IOStatus s = IOStatus::NotSupported();
+#else
+ assert(res != nullptr);
+ IOStatus s;
+ if (res == nullptr) {
+ s = IOStatus::InvalidArgument("res");
+ return s;
+ }
+
+ // 0 - for access means read metadata
+ HANDLE file_1 = RX_CreateFile(
+ RX_FN(first).c_str(), 0,
+ FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+ OPEN_EXISTING,
+ FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible
+ NULL);
+
+ if (INVALID_HANDLE_VALUE == file_1) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("open file: " + first, lastError);
+ return s;
+ }
+ UniqueCloseHandlePtr g_1(file_1, CloseHandleFunc);
+
+ HANDLE file_2 = RX_CreateFile(
+ RX_FN(second).c_str(), 0,
+ FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+ OPEN_EXISTING,
+ FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible
+ NULL);
+
+ if (INVALID_HANDLE_VALUE == file_2) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("open file: " + second, lastError);
+ return s;
+ }
+ UniqueCloseHandlePtr g_2(file_2, CloseHandleFunc);
+
+ FILE_ID_INFO FileInfo_1;
+ BOOL result = GetFileInformationByHandleEx(file_1, FileIdInfo, &FileInfo_1,
+ sizeof(FileInfo_1));
+
+ if (!result) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("stat file: " + first, lastError);
+ return s;
+ }
+
+ FILE_ID_INFO FileInfo_2;
+ result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2,
+ sizeof(FileInfo_2));
+
+ if (!result) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("stat file: " + second, lastError);
+ return s;
+ }
+
+ if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) {
+ *res =
+ (0 == memcmp(FileInfo_1.FileId.Identifier, FileInfo_2.FileId.Identifier,
+ sizeof(FileInfo_1.FileId.Identifier)));
+ } else {
+ *res = false;
+ }
+#endif
+ return s;
+}
+
+IOStatus WinFileSystem::LockFile(const std::string& lockFname,
+ const IOOptions& /*opts*/, FileLock** lock,
+ IODebugContext* /*dbg*/) {
+ assert(lock != nullptr);
+
+ *lock = NULL;
+ IOStatus result;
+
+ // No-sharing, this is a LOCK file
+ const DWORD ExclusiveAccessON = 0;
+
+ // Obtain exclusive access to the LOCK file
+ // Previously, instead of NORMAL attr we set DELETE on close and that worked
+ // well except with fault_injection test that insists on deleting it.
+ HANDLE hFile = 0;
+ {
+ IOSTATS_TIMER_GUARD(open_nanos);
+ hFile = RX_CreateFile(RX_FN(lockFname).c_str(),
+ (GENERIC_READ | GENERIC_WRITE), ExclusiveAccessON,
+ NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+ }
+
+ if (INVALID_HANDLE_VALUE == hFile) {
+ auto lastError = GetLastError();
+ result = IOErrorFromWindowsError("Failed to create lock file: " + lockFname,
+ lastError);
+ } else {
+ *lock = new WinFileLock(hFile);
+ }
+
+ return result;
+}
+
+IOStatus WinFileSystem::UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus result;
+
+ assert(lock != nullptr);
+
+ delete lock;
+
+ return result;
+}
+
+IOStatus WinFileSystem::GetTestDirectory(const IOOptions& opts,
+ std::string* result,
+ IODebugContext* dbg) {
+ std::string output;
+
+ const char* env = getenv("TEST_TMPDIR");
+ if (env && env[0] != '\0') {
+ output = env;
+ } else {
+ env = getenv("TMP");
+
+ if (env && env[0] != '\0') {
+ output = env;
+ } else {
+ output = "c:\\tmp";
+ }
+ }
+ CreateDir(output, opts, dbg);
+
+ output.append("\\testrocksdb-");
+ output.append(std::to_string(GetCurrentProcessId()));
+
+ CreateDir(output, opts, dbg);
+
+ output.swap(*result);
+
+ return IOStatus::OK();
+}
+
+IOStatus WinFileSystem::NewLogger(const std::string& fname,
+ const IOOptions& /*opts*/,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+
+ result->reset();
+
+ HANDLE hFile = 0;
+ {
+ IOSTATS_TIMER_GUARD(open_nanos);
+ hFile = RX_CreateFile(
+ RX_FN(fname).c_str(), GENERIC_WRITE,
+ FILE_SHARE_READ | FILE_SHARE_DELETE, // In RocksDb log files are
+ // renamed and deleted before
+ // they are closed. This enables
+ // doing so.
+ NULL,
+ CREATE_ALWAYS, // Original fopen mode is "w"
+ FILE_ATTRIBUTE_NORMAL, NULL);
+ }
+
+ if (INVALID_HANDLE_VALUE == hFile) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("Failed to open LogFile" + fname, lastError);
+ } else {
+ {
+ // With log files we want to set the true creation time as of now
+ // because the system
+ // for some reason caches the attributes of the previous file that just
+ // been renamed from
+ // this name so auto_roll_logger_test fails
+ FILETIME ft;
+ GetSystemTimeAsFileTime(&ft);
+ // Set creation, last access and last write time to the same value
+ SetFileTime(hFile, &ft, &ft, &ft);
+ }
+ result->reset(new WinLogger(&WinEnvThreads::gettid, clock_.get(), hFile));
+ }
+ return s;
+}
+
+IOStatus WinFileSystem::IsDirectory(const std::string& path,
+ const IOOptions& /*opts*/, bool* is_dir,
+ IODebugContext* /*dbg*/) {
+ BOOL ret = RX_PathIsDirectory(RX_FN(path).c_str());
+ if (is_dir) {
+ *is_dir = ret ? true : false;
+ }
+ return IOStatus::OK();
+}
+
+Status WinEnvIO::GetHostName(char* name, uint64_t len) {
+ Status s;
+ DWORD nSize = static_cast<DWORD>(
+ std::min<uint64_t>(len, std::numeric_limits<DWORD>::max()));
+
+ if (!::GetComputerNameA(name, &nSize)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("GetHostName", lastError);
+ } else {
+ name[nSize] = 0;
+ }
+
+ return s;
+}
+
+IOStatus WinFileSystem::GetAbsolutePath(const std::string& db_path,
+ const IOOptions& /*options*/,
+ std::string* output_path,
+ IODebugContext* dbg) {
+ // Check if we already have an absolute path
+ // For test compatibility we will consider starting slash as an
+ // absolute path
+ if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) ||
+ !RX_PathIsRelative(RX_FN(db_path).c_str())) {
+ *output_path = db_path;
+ return IOStatus::OK();
+ }
+
+ RX_FILESTRING result;
+ result.resize(MAX_PATH);
+
+ // Hopefully no changes the current directory while we do this
+ // however _getcwd also suffers from the same limitation
+ DWORD len = RX_GetCurrentDirectory(MAX_PATH, &result[0]);
+ if (len == 0) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError("Failed to get current working directory",
+ lastError);
+ }
+
+ result.resize(len);
+ std::string res = FN_TO_RX(result);
+
+ res.swap(*output_path);
+ return IOStatus::OK();
+}
+
+IOStatus WinFileSystem::GetFreeSpace(const std::string& path,
+ const IOOptions& /*options*/,
+ uint64_t* diskfree,
+ IODebugContext* /*dbg*/) {
+ assert(diskfree != nullptr);
+ ULARGE_INTEGER freeBytes;
+ BOOL f = RX_GetDiskFreeSpaceEx(RX_FN(path).c_str(), &freeBytes, NULL, NULL);
+ if (f) {
+ *diskfree = freeBytes.QuadPart;
+ return IOStatus::OK();
+ } else {
+ DWORD lastError = GetLastError();
+ return IOErrorFromWindowsError("Failed to get free space: " + path,
+ lastError);
+ }
+}
+
+FileOptions WinFileSystem::OptimizeForLogWrite(
+ const FileOptions& file_options, const DBOptions& db_options) const {
+ FileOptions optimized(file_options);
+ // These two the same as default optimizations
+ optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
+ optimized.writable_file_max_buffer_size =
+ db_options.writable_file_max_buffer_size;
+
+ // This adversely affects %999 on windows
+ optimized.use_mmap_writes = false;
+ // Direct writes will produce a huge perf impact on
+ // Windows. Pre-allocate space for WAL.
+ optimized.use_direct_writes = false;
+ return optimized;
+}
+
+FileOptions WinFileSystem::OptimizeForManifestWrite(
+ const FileOptions& options) const {
+ FileOptions optimized(options);
+ optimized.use_mmap_writes = false;
+ optimized.use_direct_reads = false;
+ return optimized;
+}
+
+FileOptions WinFileSystem::OptimizeForManifestRead(
+ const FileOptions& file_options) const {
+ FileOptions optimized(file_options);
+ optimized.use_mmap_writes = false;
+ optimized.use_direct_reads = false;
+ return optimized;
+}
+
+// Returns true iff the named directory exists and is a directory.
+bool WinFileSystem::DirExists(const std::string& dname) {
+ WIN32_FILE_ATTRIBUTE_DATA attrs;
+ if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), GetFileExInfoStandard,
+ &attrs)) {
+ return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY);
+ }
+ return false;
+}
+
+size_t WinFileSystem::GetSectorSize(const std::string& fname) {
+ size_t sector_size = kSectorSize;
+
+ // obtain device handle
+ char devicename[7] = "\\\\.\\";
+ int erresult = 0;
+ if (RX_PathIsRelative(RX_FN(fname).c_str())) {
+ RX_FILESTRING rx_current_dir;
+ rx_current_dir.resize(MAX_PATH);
+ DWORD len = RX_GetCurrentDirectory(MAX_PATH, &rx_current_dir[0]);
+ if (len == 0) {
+ return sector_size;
+ }
+ rx_current_dir.resize(len);
+ std::string current_dir = FN_TO_RX(rx_current_dir);
+ erresult =
+ strncat_s(devicename, sizeof(devicename), current_dir.c_str(), 2);
+ } else {
+ erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2);
+ }
+
+ if (erresult) {
+ assert(false);
+ return sector_size;
+ }
+
+ HANDLE hDevice = CreateFile(devicename, 0, 0, nullptr, OPEN_EXISTING,
+ FILE_ATTRIBUTE_NORMAL, nullptr);
+
+ if (hDevice == INVALID_HANDLE_VALUE) {
+ return sector_size;
+ }
+
+ STORAGE_PROPERTY_QUERY spropertyquery;
+ spropertyquery.PropertyId = StorageAccessAlignmentProperty;
+ spropertyquery.QueryType = PropertyStandardQuery;
+
+ BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)];
+ DWORD output_bytes = 0;
+
+ BOOL ret = DeviceIoControl(
+ hDevice, IOCTL_STORAGE_QUERY_PROPERTY, &spropertyquery,
+ sizeof(spropertyquery), output_buffer,
+ sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), &output_bytes, nullptr);
+
+ if (ret) {
+ sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR*)output_buffer)
+ ->BytesPerLogicalSector;
+ } else {
+ // many devices do not support StorageProcessAlignmentProperty. Any failure
+ // here and we fall back to logical alignment
+
+ DISK_GEOMETRY_EX geometry = {0};
+ ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, nullptr, 0,
+ &geometry, sizeof(geometry), &output_bytes, nullptr);
+ if (ret) {
+ sector_size = geometry.Geometry.BytesPerSector;
+ }
+ }
+
+ if (hDevice != INVALID_HANDLE_VALUE) {
+ CloseHandle(hDevice);
+ }
+
+ return sector_size;
+}
+
+////////////////////////////////////////////////////////////////////////
+// WinEnvThreads
+
+WinEnvThreads::WinEnvThreads(Env* hosted_env)
+ : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) {
+ for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+ thread_pools_[pool_id].SetThreadPriority(
+ static_cast<Env::Priority>(pool_id));
+ // This allows later initializing the thread-local-env of each thread.
+ thread_pools_[pool_id].SetHostEnv(hosted_env);
+ }
+}
+
+WinEnvThreads::~WinEnvThreads() {
+ WaitForJoin();
+
+ for (auto& thpool : thread_pools_) {
+ thpool.JoinAllThreads();
+ }
+}
+
+void WinEnvThreads::Schedule(void (*function)(void*), void* arg,
+ Env::Priority pri, void* tag,
+ void (*unschedFunction)(void* arg)) {
+ assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+ thread_pools_[pri].Schedule(function, arg, tag, unschedFunction);
+}
+
+int WinEnvThreads::UnSchedule(void* arg, Env::Priority pri) {
+ return thread_pools_[pri].UnSchedule(arg);
+}
+
+namespace {
+
+struct StartThreadState {
+ void (*user_function)(void*);
+ void* arg;
+};
+
+void* StartThreadWrapper(void* arg) {
+ std::unique_ptr<StartThreadState> state(
+ reinterpret_cast<StartThreadState*>(arg));
+ state->user_function(state->arg);
+ return nullptr;
+}
+
+} // namespace
+
+void WinEnvThreads::StartThread(void (*function)(void* arg), void* arg) {
+ std::unique_ptr<StartThreadState> state(new StartThreadState);
+ state->user_function = function;
+ state->arg = arg;
+ try {
+ Thread th(&StartThreadWrapper, state.get());
+ state.release();
+
+ std::lock_guard<std::mutex> lg(mu_);
+ threads_to_join_.push_back(std::move(th));
+
+ } catch (const std::system_error& ex) {
+ WinthreadCall("start thread", ex.code());
+ }
+}
+
+void WinEnvThreads::WaitForJoin() {
+ for (auto& th : threads_to_join_) {
+ th.join();
+ }
+ threads_to_join_.clear();
+}
+
+unsigned int WinEnvThreads::GetThreadPoolQueueLen(Env::Priority pri) const {
+ assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+ return thread_pools_[pri].GetQueueLen();
+}
+
+int WinEnvThreads::ReserveThreads(int threads_to_reserved, Env::Priority pri) {
+ assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+ return thread_pools_[pri].ReserveThreads(threads_to_reserved);
+}
+
+int WinEnvThreads::ReleaseThreads(int threads_to_released, Env::Priority pri) {
+ assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+ return thread_pools_[pri].ReleaseThreads(threads_to_released);
+}
+
+uint64_t WinEnvThreads::gettid() {
+ uint64_t thread_id = GetCurrentThreadId();
+ return thread_id;
+}
+
+uint64_t WinEnvThreads::GetThreadID() const { return gettid(); }
+
+void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) {
+ assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+ thread_pools_[pri].SetBackgroundThreads(num);
+}
+
+int WinEnvThreads::GetBackgroundThreads(Env::Priority pri) {
+ assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+ return thread_pools_[pri].GetBackgroundThreads();
+}
+
+void WinEnvThreads::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) {
+ assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+ thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
+}
+
+/////////////////////////////////////////////////////////////////////////
+// WinEnv
+
+WinEnv::WinEnv()
+ : CompositeEnv(WinFileSystem::Default(), WinClock::Default()),
+ winenv_io_(this),
+ winenv_threads_(this) {
+ // Protected member of the base class
+ thread_status_updater_ = CreateThreadStatusUpdater();
+}
+
+WinEnv::~WinEnv() {
+ // All threads must be joined before the deletion of
+ // thread_status_updater_.
+ delete thread_status_updater_;
+}
+
+Status WinEnv::GetThreadList(std::vector<ThreadStatus>* thread_list) {
+ assert(thread_status_updater_);
+ return thread_status_updater_->GetThreadList(thread_list);
+}
+
+Status WinEnv::GetHostName(char* name, uint64_t len) {
+ return winenv_io_.GetHostName(name, len);
+}
+
+void WinEnv::Schedule(void (*function)(void*), void* arg, Env::Priority pri,
+ void* tag, void (*unschedFunction)(void* arg)) {
+ return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction);
+}
+
+int WinEnv::UnSchedule(void* arg, Env::Priority pri) {
+ return winenv_threads_.UnSchedule(arg, pri);
+}
+
+void WinEnv::StartThread(void (*function)(void* arg), void* arg) {
+ return winenv_threads_.StartThread(function, arg);
+}
+
+void WinEnv::WaitForJoin() { return winenv_threads_.WaitForJoin(); }
+
+unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const {
+ return winenv_threads_.GetThreadPoolQueueLen(pri);
+}
+int WinEnv::ReserveThreads(int threads_to_reserved, Env::Priority pri) {
+ return winenv_threads_.ReserveThreads(threads_to_reserved, pri);
+}
+
+int WinEnv::ReleaseThreads(int threads_to_released, Env::Priority pri) {
+ return winenv_threads_.ReleaseThreads(threads_to_released, pri);
+}
+
+uint64_t WinEnv::GetThreadID() const { return winenv_threads_.GetThreadID(); }
+
+// Allow increasing the number of worker threads.
+void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) {
+ return winenv_threads_.SetBackgroundThreads(num, pri);
+}
+
+int WinEnv::GetBackgroundThreads(Env::Priority pri) {
+ return winenv_threads_.GetBackgroundThreads(pri);
+}
+
+void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) {
+ return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri);
+}
+
+} // namespace port
+
+std::shared_ptr<FileSystem> FileSystem::Default() {
+ return port::WinFileSystem::Default();
+}
+
+const std::shared_ptr<SystemClock>& SystemClock::Default() {
+ STATIC_AVOID_DESTRUCTION(std::shared_ptr<SystemClock>, clock)
+ (std::make_shared<port::WinClock>());
+ return clock;
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/env_win.h b/src/rocksdb/port/win/env_win.h
new file mode 100644
index 000000000..8fbfb8246
--- /dev/null
+++ b/src/rocksdb/port/win/env_win.h
@@ -0,0 +1,304 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc. Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine gain control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#pragma once
+#include <stdint.h>
+#include <windows.h>
+
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "util/threadpool_imp.h"
+
+#undef GetCurrentTime
+#undef DeleteFile
+#undef LoadLibrary
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// Currently not designed for inheritance but rather a replacement
+class WinEnvThreads {
+ public:
+ explicit WinEnvThreads(Env* hosted_env);
+
+ ~WinEnvThreads();
+
+ WinEnvThreads(const WinEnvThreads&) = delete;
+ WinEnvThreads& operator=(const WinEnvThreads&) = delete;
+
+ void Schedule(void (*function)(void*), void* arg, Env::Priority pri,
+ void* tag, void (*unschedFunction)(void* arg));
+
+ int UnSchedule(void* arg, Env::Priority pri);
+
+ void StartThread(void (*function)(void* arg), void* arg);
+
+ void WaitForJoin();
+
+ unsigned int GetThreadPoolQueueLen(Env::Priority pri) const;
+
+ int ReserveThreads(int threads_to_be_reserved, Env::Priority pri);
+
+ int ReleaseThreads(int threads_to_be_released, Env::Priority pri);
+
+ static uint64_t gettid();
+
+ uint64_t GetThreadID() const;
+
+ // Allow increasing the number of worker threads.
+ void SetBackgroundThreads(int num, Env::Priority pri);
+ int GetBackgroundThreads(Env::Priority pri);
+
+ void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri);
+
+ private:
+ Env* hosted_env_;
+ mutable std::mutex mu_;
+ std::vector<ThreadPoolImpl> thread_pools_;
+ std::vector<Thread> threads_to_join_;
+};
+
+class WinClock : public SystemClock {
+ public:
+ WinClock();
+ virtual ~WinClock() {}
+
+ static const char* kClassName() { return "WindowsClock"; }
+ const char* Name() const override { return kDefaultName(); }
+ const char* NickName() const override { return kClassName(); }
+
+ uint64_t NowMicros() override;
+
+ uint64_t NowNanos() override;
+
+ // 0 indicates not supported
+ uint64_t CPUMicros() override { return 0; }
+ void SleepForMicroseconds(int micros) override;
+
+ Status GetCurrentTime(int64_t* unix_time) override;
+ // Converts seconds-since-Jan-01-1970 to a printable string
+ virtual std::string TimeToString(uint64_t time);
+
+ uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; }
+
+ private:
+ using FnGetSystemTimePreciseAsFileTime = VOID(WINAPI*)(LPFILETIME);
+
+ uint64_t perf_counter_frequency_;
+ uint64_t nano_seconds_per_period_;
+ FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
+};
+
+class WinFileSystem : public FileSystem {
+ public:
+ static const std::shared_ptr<WinFileSystem>& Default();
+ WinFileSystem(const std::shared_ptr<SystemClock>& clock);
+ ~WinFileSystem() {}
+ static const char* kClassName() { return "WinFS"; }
+ const char* Name() const override { return kClassName(); }
+ const char* NickName() const { return kDefaultName(); }
+
+ static size_t GetSectorSize(const std::string& fname);
+ size_t GetPageSize() const { return page_size_; }
+ size_t GetAllocationGranularity() const { return allocation_granularity_; }
+
+ IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ // Truncate the named file to the specified size.
+ IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) override;
+ IOStatus NewSequentialFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSSequentialFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& options,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* /*dbg*/) override;
+ IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* r,
+ IODebugContext* dbg) override;
+ IOStatus ReopenWritableFile(const std::string& fname,
+ const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override;
+
+ IOStatus NewRandomRWFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomRWFile>* result,
+ IODebugContext* dbg) override;
+ IOStatus NewMemoryMappedFileBuffer(
+ const std::string& fname,
+ std::unique_ptr<MemoryMappedFileBuffer>* result) override;
+
+ IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+ std::unique_ptr<FSDirectory>* result,
+ IODebugContext* dbg) override;
+ IOStatus FileExists(const std::string& f, const IOOptions& io_opts,
+ IODebugContext* dbg) override;
+ IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+ std::vector<std::string>* r,
+ IODebugContext* dbg) override;
+ IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ // Creates directory if missing. Return Ok if it exists, or successful in
+ // Creating.
+ IOStatus CreateDirIfMissing(const std::string& dirname,
+ const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ // Delete the specified directory.
+ IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+ IODebugContext* dbg) override;
+ // Store the size of fname in *file_size.
+ IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+ uint64_t* file_size, IODebugContext* dbg) override;
+ // Store the last modification time of fname in *file_mtime.
+ IOStatus GetFileModificationTime(const std::string& fname,
+ const IOOptions& options,
+ uint64_t* file_mtime,
+ IODebugContext* dbg) override;
+ // Rename file src to target.
+ IOStatus RenameFile(const std::string& src, const std::string& target,
+ const IOOptions& options, IODebugContext* dbg) override;
+
+ // Hard Link file src to target.
+ IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) override;
+ IOStatus NumFileLinks(const std::string& /*fname*/,
+ const IOOptions& /*options*/, uint64_t* /*count*/,
+ IODebugContext* /*dbg*/) override;
+ IOStatus AreFilesSame(const std::string& /*first*/,
+ const std::string& /*second*/,
+ const IOOptions& /*options*/, bool* /*res*/,
+ IODebugContext* /*dbg*/) override;
+ IOStatus LockFile(const std::string& fname, const IOOptions& options,
+ FileLock** lock, IODebugContext* dbg) override;
+ IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+ IODebugContext* dbg) override;
+ IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+ IODebugContext* dbg) override;
+
+ // Create and returns a default logger (an instance of EnvLogger) for storing
+ // informational messages. Derived classes can override to provide custom
+ // logger.
+ IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+ std::shared_ptr<Logger>* result,
+ IODebugContext* dbg) override;
+ // Get full directory name for this db.
+ IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options,
+ std::string* output_path,
+ IODebugContext* dbg) override;
+ IOStatus IsDirectory(const std::string& /*path*/, const IOOptions& options,
+ bool* is_dir, IODebugContext* /*dgb*/) override;
+ // This seems to clash with a macro on Windows, so #undef it here
+#undef GetFreeSpace
+ IOStatus GetFreeSpace(const std::string& /*path*/,
+ const IOOptions& /*options*/, uint64_t* /*diskfree*/,
+ IODebugContext* /*dbg*/) override;
+ FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+ const DBOptions& db_options) const override;
+ FileOptions OptimizeForManifestRead(
+ const FileOptions& file_options) const override;
+ FileOptions OptimizeForManifestWrite(
+ const FileOptions& file_options) const override;
+
+ protected:
+ static uint64_t FileTimeToUnixTime(const FILETIME& ftTime);
+ // Returns true iff the named directory exists and is a directory.
+
+ virtual bool DirExists(const std::string& dname);
+ // Helper for NewWritable and ReopenWritableFile
+ virtual IOStatus OpenWritableFile(const std::string& fname,
+ const FileOptions& options,
+ std::unique_ptr<FSWritableFile>* result,
+ bool reopen);
+
+ private:
+ std::shared_ptr<SystemClock> clock_;
+ size_t page_size_;
+ size_t allocation_granularity_;
+};
+
+// Designed for inheritance so can be re-used
+// but certain parts replaced
+class WinEnvIO {
+ public:
+ explicit WinEnvIO(Env* hosted_env);
+
+ virtual ~WinEnvIO();
+
+ virtual Status GetHostName(char* name, uint64_t len);
+
+ private:
+ Env* hosted_env_;
+};
+
+class WinEnv : public CompositeEnv {
+ public:
+ WinEnv();
+
+ ~WinEnv();
+ static const char* kClassName() { return "WinEnv"; }
+ const char* Name() const override { return kClassName(); }
+ const char* NickName() const override { return kDefaultName(); }
+
+ Status GetHostName(char* name, uint64_t len) override;
+
+ Status GetThreadList(std::vector<ThreadStatus>* thread_list) override;
+
+ void Schedule(void (*function)(void*), void* arg, Env::Priority pri,
+ void* tag, void (*unschedFunction)(void* arg)) override;
+
+ int UnSchedule(void* arg, Env::Priority pri) override;
+
+ void StartThread(void (*function)(void* arg), void* arg) override;
+
+ void WaitForJoin() override;
+
+ unsigned int GetThreadPoolQueueLen(Env::Priority pri) const override;
+
+ int ReserveThreads(int threads_to_be_reserved, Env::Priority pri) override;
+
+ int ReleaseThreads(int threads_to_be_released, Env::Priority pri) override;
+
+ uint64_t GetThreadID() const override;
+
+ // Allow increasing the number of worker threads.
+ void SetBackgroundThreads(int num, Env::Priority pri) override;
+ int GetBackgroundThreads(Env::Priority pri) override;
+
+ void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override;
+
+ private:
+ WinEnvIO winenv_io_;
+ WinEnvThreads winenv_threads_;
+};
+
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/io_win.cc b/src/rocksdb/port/win/io_win.cc
new file mode 100644
index 000000000..4fa735518
--- /dev/null
+++ b/src/rocksdb/port/win/io_win.cc
@@ -0,0 +1,1101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/io_win.h"
+
+#include "env_win.h"
+#include "monitoring/iostats_context_imp.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+/*
+ * DirectIOHelper
+ */
+namespace {
+
+const size_t kSectorSize = 512;
+
+inline bool IsPowerOfTwo(const size_t alignment) {
+ return ((alignment) & (alignment - 1)) == 0;
+}
+
+inline bool IsAligned(size_t alignment, const void* ptr) {
+ return ((uintptr_t(ptr)) & (alignment - 1)) == 0;
+}
+} // namespace
+
+std::string GetWindowsErrSz(DWORD err) {
+ std::string Err;
+ LPSTR lpMsgBuf = nullptr;
+ FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, err,
+ 0, // Default language
+ reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
+
+ if (lpMsgBuf) {
+ Err = lpMsgBuf;
+ LocalFree(lpMsgBuf);
+ }
+ return Err;
+}
+
+// We preserve the original name of this interface to denote the original idea
+// behind it.
+// All reads happen by a specified offset and pwrite interface does not change
+// the position of the file pointer. Judging from the man page and errno it does
+// execute
+// lseek atomically to return the position of the file back where it was.
+// WriteFile() does not
+// have this capability. Therefore, for both pread and pwrite the pointer is
+// advanced to the next position
+// which is fine for writes because they are (should be) sequential.
+// Because all the reads/writes happen by the specified offset, the caller in
+// theory should not
+// rely on the current file offset.
+IOStatus pwrite(const WinFileData* file_data, const Slice& data,
+ uint64_t offset, size_t& bytes_written) {
+ IOStatus s;
+ bytes_written = 0;
+
+ size_t num_bytes = data.size();
+ if (num_bytes > std::numeric_limits<DWORD>::max()) {
+ // May happen in 64-bit builds where size_t is 64-bits but
+ // long is still 32-bit, but that's the API here at the moment
+ return IOStatus::InvalidArgument(
+ "num_bytes is too large for a single write: " + file_data->GetName());
+ }
+
+ OVERLAPPED overlapped = {0};
+ ULARGE_INTEGER offsetUnion;
+ offsetUnion.QuadPart = offset;
+
+ overlapped.Offset = offsetUnion.LowPart;
+ overlapped.OffsetHigh = offsetUnion.HighPart;
+
+ DWORD bytesWritten = 0;
+
+ if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(),
+ static_cast<DWORD>(num_bytes), &bytesWritten,
+ &overlapped)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
+ lastError);
+ } else {
+ bytes_written = bytesWritten;
+ }
+
+ return s;
+}
+
+// See comments for pwrite above
+IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
+ uint64_t offset, size_t& bytes_read) {
+ IOStatus s;
+ bytes_read = 0;
+
+ if (num_bytes > std::numeric_limits<DWORD>::max()) {
+ return IOStatus::InvalidArgument(
+ "num_bytes is too large for a single read: " + file_data->GetName());
+ }
+
+ OVERLAPPED overlapped = {0};
+ ULARGE_INTEGER offsetUnion;
+ offsetUnion.QuadPart = offset;
+
+ overlapped.Offset = offsetUnion.LowPart;
+ overlapped.OffsetHigh = offsetUnion.HighPart;
+
+ DWORD bytesRead = 0;
+
+ if (FALSE == ReadFile(file_data->GetFileHandle(), src,
+ static_cast<DWORD>(num_bytes), &bytesRead,
+ &overlapped)) {
+ auto lastError = GetLastError();
+ // EOF is OK with zero bytes read
+ if (lastError != ERROR_HANDLE_EOF) {
+ s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
+ lastError);
+ }
+ } else {
+ bytes_read = bytesRead;
+ }
+
+ return s;
+}
+
+// SetFileInformationByHandle() is capable of fast pre-allocates.
+// However, this does not change the file end position unless the file is
+// truncated and the pre-allocated space is not considered filled with zeros.
+IOStatus fallocate(const std::string& filename, HANDLE hFile,
+ uint64_t to_size) {
+ IOStatus status;
+
+ FILE_ALLOCATION_INFO alloc_info;
+ alloc_info.AllocationSize.QuadPart = to_size;
+
+ if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
+ sizeof(FILE_ALLOCATION_INFO))) {
+ auto lastError = GetLastError();
+ status = IOErrorFromWindowsError(
+ "Failed to pre-allocate space: " + filename, lastError);
+ }
+
+ return status;
+}
+
+IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) {
+ IOStatus status;
+
+ FILE_END_OF_FILE_INFO end_of_file;
+ end_of_file.EndOfFile.QuadPart = toSize;
+
+ if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
+ sizeof(FILE_END_OF_FILE_INFO))) {
+ auto lastError = GetLastError();
+ status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
+ lastError);
+ }
+
+ return status;
+}
+
+size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
+ size_t /*max_size*/) {
+ // Returning 0 is safe as it causes the table reader to generate a unique ID.
+ // This is suboptimal for performance as it prevents multiple table readers
+ // for the same file from sharing cached blocks. For example, if users have
+ // a low value for `max_open_files`, there can be many table readers opened
+ // for the same file.
+ //
+ // TODO: this is a temporarily solution as it is safe but not optimal for
+ // performance. For more details see discussion in
+ // https://github.com/facebook/rocksdb/pull/5844.
+ return 0;
+}
+
+WinFileData::WinFileData(const std::string& filename, HANDLE hFile,
+ bool direct_io)
+ : filename_(filename),
+ hFile_(hFile),
+ use_direct_io_(direct_io),
+ sector_size_(WinFileSystem::GetSectorSize(filename)) {}
+
+bool WinFileData::IsSectorAligned(const size_t off) const {
+ return (off & (sector_size_ - 1)) == 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// WinMmapReadableFile
+
+WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
+ HANDLE hFile, HANDLE hMap,
+ const void* mapped_region,
+ size_t length)
+ : WinFileData(fileName, hFile, false /* use_direct_io */),
+ hMap_(hMap),
+ mapped_region_(mapped_region),
+ length_(length) {}
+
+WinMmapReadableFile::~WinMmapReadableFile() {
+ BOOL ret __attribute__((__unused__));
+ ret = ::UnmapViewOfFile(mapped_region_);
+ assert(ret);
+
+ ret = ::CloseHandle(hMap_);
+ assert(ret);
+}
+
+IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n,
+ const IOOptions& /*options*/, Slice* result,
+ char* scratch,
+ IODebugContext* /*dbg*/) const {
+ IOStatus s;
+
+ if (offset > length_) {
+ *result = Slice();
+ return IOError(filename_, EINVAL);
+ } else if (offset + n > length_) {
+ n = length_ - static_cast<size_t>(offset);
+ }
+ *result = Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
+ return s;
+}
+
+IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
+ return IOStatus::OK();
+}
+
+size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// WinMmapFile
+
+// Can only truncate or reserve to a sector size aligned if
+// used on files that are opened with Unbuffered I/O
+IOStatus WinMmapFile::TruncateFile(uint64_t toSize) {
+ return ftruncate(filename_, hFile_, toSize);
+}
+
+IOStatus WinMmapFile::UnmapCurrentRegion() {
+ IOStatus status;
+
+ if (mapped_begin_ != nullptr) {
+ if (!::UnmapViewOfFile(mapped_begin_)) {
+ status = IOErrorFromWindowsError(
+ "Failed to unmap file view: " + filename_, GetLastError());
+ }
+
+ // Move on to the next portion of the file
+ file_offset_ += view_size_;
+
+ // UnmapView automatically sends data to disk but not the metadata
+ // which is good and provides some equivalent of fdatasync() on Linux
+ // therefore, we donot need separate flag for metadata
+ mapped_begin_ = nullptr;
+ mapped_end_ = nullptr;
+ dst_ = nullptr;
+
+ last_sync_ = nullptr;
+ pending_sync_ = false;
+ }
+
+ return status;
+}
+
+IOStatus WinMmapFile::MapNewRegion(const IOOptions& options,
+ IODebugContext* dbg) {
+ IOStatus status;
+
+ assert(mapped_begin_ == nullptr);
+
+ size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
+
+ if (minDiskSize > reserved_size_) {
+ status = Allocate(file_offset_, view_size_, options, dbg);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ // Need to remap
+ if (hMap_ == NULL || reserved_size_ > mapping_size_) {
+ if (hMap_ != NULL) {
+ // Unmap the previous one
+ BOOL ret __attribute__((__unused__));
+ ret = ::CloseHandle(hMap_);
+ assert(ret);
+ hMap_ = NULL;
+ }
+
+ ULARGE_INTEGER mappingSize;
+ mappingSize.QuadPart = reserved_size_;
+
+ hMap_ = CreateFileMappingA(
+ hFile_,
+ NULL, // Security attributes
+ PAGE_READWRITE, // There is not a write only mode for mapping
+ mappingSize.HighPart, // Enable mapping the whole file but the actual
+ // amount mapped is determined by MapViewOfFile
+ mappingSize.LowPart,
+ NULL); // Mapping name
+
+ if (NULL == hMap_) {
+ return IOErrorFromWindowsError(
+ "WindowsMmapFile failed to create file mapping for: " + filename_,
+ GetLastError());
+ }
+
+ mapping_size_ = reserved_size_;
+ }
+
+ ULARGE_INTEGER offset;
+ offset.QuadPart = file_offset_;
+
+ // View must begin at the granularity aligned offset
+ mapped_begin_ = reinterpret_cast<char*>(
+ MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
+ view_size_, NULL));
+
+ if (!mapped_begin_) {
+ status = IOErrorFromWindowsError(
+ "WindowsMmapFile failed to map file view: " + filename_,
+ GetLastError());
+ } else {
+ mapped_end_ = mapped_begin_ + view_size_;
+ dst_ = mapped_begin_;
+ last_sync_ = mapped_begin_;
+ pending_sync_ = false;
+ }
+ return status;
+}
+
+IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
+ return fallocate(filename_, hFile_, spaceToReserve);
+}
+
+WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
+ size_t page_size, size_t allocation_granularity,
+ const FileOptions& options)
+ : WinFileData(fname, hFile, false),
+ FSWritableFile(options),
+ hMap_(NULL),
+ page_size_(page_size),
+ allocation_granularity_(allocation_granularity),
+ reserved_size_(0),
+ mapping_size_(0),
+ view_size_(0),
+ mapped_begin_(nullptr),
+ mapped_end_(nullptr),
+ dst_(nullptr),
+ last_sync_(nullptr),
+ file_offset_(0),
+ pending_sync_(false) {
+ // Allocation granularity must be obtained from GetSystemInfo() and must be
+ // a power of two.
+ assert(allocation_granularity > 0);
+ assert((allocation_granularity & (allocation_granularity - 1)) == 0);
+
+ assert(page_size > 0);
+ assert((page_size & (page_size - 1)) == 0);
+
+ // Only for memory mapped writes
+ assert(options.use_mmap_writes);
+
+ // View size must be both the multiple of allocation_granularity AND the
+ // page size and the granularity is usually a multiple of a page size.
+ const size_t viewSize =
+ 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
+ view_size_ = Roundup(viewSize, allocation_granularity_);
+}
+
+WinMmapFile::~WinMmapFile() {
+ if (hFile_) {
+ this->Close(IOOptions(), nullptr);
+ }
+}
+
+IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) {
+ const char* src = data.data();
+ size_t left = data.size();
+
+ while (left > 0) {
+ assert(mapped_begin_ <= dst_);
+ size_t avail = mapped_end_ - dst_;
+
+ if (avail == 0) {
+ IOStatus s = UnmapCurrentRegion();
+ if (s.ok()) {
+ s = MapNewRegion(options, dbg);
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ size_t n = std::min(left, avail);
+ memcpy(dst_, src, n);
+ dst_ += n;
+ src += n;
+ left -= n;
+ pending_sync_ = true;
+ }
+ }
+
+ // Now make sure that the last partial page is padded with zeros if needed
+ size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
+ if (bytesToPad > 0) {
+ memset(dst_, 0, bytesToPad);
+ }
+
+ return IOStatus::OK();
+}
+
+// Means Close() will properly take care of truncate
+// and it does not need any additional information
+IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) {
+ IOStatus s;
+
+ assert(NULL != hFile_);
+
+ // We truncate to the precise size so no
+ // uninitialized data at the end. SetEndOfFile
+ // which we use does not write zeros and it is good.
+ uint64_t targetSize = GetFileSize(options, dbg);
+
+ if (mapped_begin_ != nullptr) {
+ // Sync before unmapping to make sure everything
+ // is on disk and there is not a lazy writing
+ // so we are deterministic with the tests
+ Sync(options, dbg);
+ s = UnmapCurrentRegion();
+ }
+
+ if (NULL != hMap_) {
+ BOOL ret = ::CloseHandle(hMap_);
+ if (!ret && s.ok()) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Failed to Close mapping for file: " + filename_, lastError);
+ }
+
+ hMap_ = NULL;
+ }
+
+ if (hFile_ != NULL) {
+ TruncateFile(targetSize);
+
+ BOOL ret = ::CloseHandle(hFile_);
+ hFile_ = NULL;
+
+ if (!ret && s.ok()) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Failed to close file map handle: " + filename_, lastError);
+ }
+ }
+
+ return s;
+}
+
+IOStatus WinMmapFile::Flush(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+// Flush only data
+IOStatus WinMmapFile::Sync(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+
+ // Some writes occurred since last sync
+ if (dst_ > last_sync_) {
+ assert(mapped_begin_);
+ assert(dst_);
+ assert(dst_ > mapped_begin_);
+ assert(dst_ < mapped_end_);
+
+ size_t page_begin =
+ TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
+ size_t page_end =
+ TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
+
+ // Flush only the amount of that is a multiple of pages
+ if (!::FlushViewOfFile(mapped_begin_ + page_begin,
+ (page_end - page_begin) + page_size_)) {
+ s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
+ GetLastError());
+ } else {
+ last_sync_ = dst_;
+ }
+ }
+
+ return s;
+}
+
+/**
+ * Flush data as well as metadata to stable storage.
+ */
+IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
+ IOStatus s = Sync(options, dbg);
+
+ // Flush metadata
+ if (s.ok() && pending_sync_) {
+ if (!::FlushFileBuffers(hFile_)) {
+ s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
+ GetLastError());
+ }
+ pending_sync_ = false;
+ }
+
+ return s;
+}
+
+/**
+ * Get the size of valid data in the file. This will not match the
+ * size that is returned from the filesystem because we use mmap
+ * to extend file by map_size every time.
+ */
+uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ size_t used = dst_ - mapped_begin_;
+ return file_offset_ + used;
+}
+
+IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) {
+ return IOStatus::OK();
+}
+
+IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus status;
+ TEST_KILL_RANDOM("WinMmapFile::Allocate");
+
+ // Make sure that we reserve an aligned amount of space
+ // since the reservation block size is driven outside so we want
+ // to check if we are ok with reservation here
+ size_t spaceToReserve =
+ Roundup(static_cast<size_t>(offset + len), view_size_);
+ // Nothing to do
+ if (spaceToReserve <= reserved_size_) {
+ return status;
+ }
+
+ IOSTATS_TIMER_GUARD(allocate_nanos);
+ status = PreallocateInternal(spaceToReserve);
+ if (status.ok()) {
+ reserved_size_ = spaceToReserve;
+ }
+ return status;
+}
+
+size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+// WinSequentialFile
+
+WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
+ const FileOptions& options)
+ : WinFileData(fname, f, options.use_direct_reads) {}
+
+WinSequentialFile::~WinSequentialFile() {
+ assert(hFile_ != INVALID_HANDLE_VALUE);
+}
+
+IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
+ Slice* result, char* scratch,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+ size_t r = 0;
+
+ assert(result != nullptr);
+ if (WinFileData::use_direct_io()) {
+ return IOStatus::NotSupported("Read() does not support direct_io");
+ }
+
+ // Windows ReadFile API accepts a DWORD.
+ // While it is possible to read in a loop if n is too big
+ // it is an unlikely case.
+ if (n > std::numeric_limits<DWORD>::max()) {
+ return IOStatus::InvalidArgument("n is too big for a single ReadFile: " +
+ filename_);
+ }
+
+ DWORD bytesToRead =
+ static_cast<DWORD>(n); // cast is safe due to the check above
+ DWORD bytesRead = 0;
+ BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
+ if (ret != FALSE) {
+ r = bytesRead;
+ } else {
+ auto lastError = GetLastError();
+ if (lastError != ERROR_HANDLE_EOF) {
+ s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError);
+ }
+ }
+
+ *result = Slice(scratch, r);
+ return s;
+}
+
+IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
+ uint64_t offset,
+ size_t& bytes_read) const {
+ return pread(this, src, numBytes, offset, bytes_read);
+}
+
+IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n,
+ const IOOptions& /*opts*/,
+ Slice* result, char* scratch,
+ IODebugContext* /*dbg*/) {
+ if (!WinFileData::use_direct_io()) {
+ return IOStatus::NotSupported("This function is only used for direct_io");
+ }
+
+ assert(IsSectorAligned(static_cast<size_t>(offset)));
+ assert(IsSectorAligned(static_cast<size_t>(n)));
+
+ size_t bytes_read = 0; // out param
+ IOStatus s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset,
+ bytes_read);
+ *result = Slice(scratch, bytes_read);
+ return s;
+}
+
+IOStatus WinSequentialFile::Skip(uint64_t n) {
+ // Can't handle more than signed max as SetFilePointerEx accepts a signed
+ // 64-bit integer. As such it is a highly unlikley case to have n so large.
+ if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
+ return IOStatus::InvalidArgument(
+ "n is too large for a single SetFilePointerEx() call" + filename_);
+ }
+
+ LARGE_INTEGER li;
+ li.QuadPart = static_cast<LONGLONG>(n); // cast is safe due to the check
+ // above
+ BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
+ if (ret == FALSE) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
+ lastError);
+ }
+ return IOStatus::OK();
+}
+
+IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
+ return IOStatus::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+/// WinRandomAccessBase
+
+inline IOStatus WinRandomAccessImpl::PositionedReadInternal(
+ char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const {
+ return pread(file_base_, src, numBytes, offset, bytes_read);
+}
+
+inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
+ size_t alignment,
+ const FileOptions& options)
+ : file_base_(file_base),
+ alignment_(std::max(alignment, file_base->GetSectorSize())) {
+ assert(!options.use_mmap_reads);
+}
+
+inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n,
+ Slice* result,
+ char* scratch) const {
+ // Check buffer alignment
+ if (file_base_->use_direct_io()) {
+ assert(file_base_->IsSectorAligned(static_cast<size_t>(offset)));
+ assert(IsAligned(alignment_, scratch));
+ }
+
+ if (n == 0) {
+ *result = Slice(scratch, 0);
+ return IOStatus::OK();
+ }
+
+ size_t bytes_read = 0;
+ IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read);
+ *result = Slice(scratch, bytes_read);
+ return s;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/// WinRandomAccessFile
+
+WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
+ size_t alignment,
+ const FileOptions& options)
+ : WinFileData(fname, hFile, options.use_direct_reads),
+ WinRandomAccessImpl(this, alignment, options) {}
+
+WinRandomAccessFile::~WinRandomAccessFile() {}
+
+IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n,
+ const IOOptions& /*options*/, Slice* result,
+ char* scratch,
+ IODebugContext* /*dbg*/) const {
+ return ReadImpl(offset, n, result, scratch);
+}
+
+IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
+ return IOStatus::OK();
+}
+
+size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
+}
+
+size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
+ return GetAlignment();
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// WinWritableImpl
+//
+
+inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
+ return fallocate(file_data_->GetName(), file_data_->GetFileHandle(),
+ spaceToReserve);
+}
+
+inline WinWritableImpl::WinWritableImpl(WinFileData* file_data,
+ size_t alignment)
+ : file_data_(file_data),
+ alignment_(std::max(alignment, file_data->GetSectorSize())),
+ next_write_offset_(0),
+ reservedsize_(0) {
+ // Query current position in case ReopenWritableFile is called
+ // This position is only important for buffered writes
+ // for unbuffered writes we explicitely specify the position.
+ LARGE_INTEGER zero_move;
+ zero_move.QuadPart = 0; // Do not move
+ LARGE_INTEGER pos;
+ pos.QuadPart = 0;
+ BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
+ FILE_CURRENT);
+ // Querying no supped to fail
+ if (ret != 0) {
+ next_write_offset_ = pos.QuadPart;
+ } else {
+ assert(false);
+ }
+}
+
+inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) {
+ IOStatus s;
+
+ if (data.size() > std::numeric_limits<DWORD>::max()) {
+ return IOStatus::InvalidArgument("data is too long for a single write" +
+ file_data_->GetName());
+ }
+
+ size_t bytes_written = 0; // out param
+
+ if (file_data_->use_direct_io()) {
+ // With no offset specified we are appending
+ // to the end of the file
+ assert(file_data_->IsSectorAligned(next_write_offset_));
+ assert(file_data_->IsSectorAligned(data.size()));
+ assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
+ s = pwrite(file_data_, data, next_write_offset_, bytes_written);
+ } else {
+ DWORD bytesWritten = 0;
+ if (!WriteFile(file_data_->GetFileHandle(), data.data(),
+ static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Failed to WriteFile: " + file_data_->GetName(), lastError);
+ } else {
+ bytes_written = bytesWritten;
+ }
+ }
+
+ if (s.ok()) {
+ if (bytes_written == data.size()) {
+ // This matters for direct_io cases where
+ // we rely on the fact that next_write_offset_
+ // is sector aligned
+ next_write_offset_ += bytes_written;
+ } else {
+ s = IOStatus::IOError("Failed to write all bytes: " +
+ file_data_->GetName());
+ }
+ }
+
+ return s;
+}
+
+inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data,
+ uint64_t offset) {
+ if (file_data_->use_direct_io()) {
+ assert(file_data_->IsSectorAligned(static_cast<size_t>(offset)));
+ assert(file_data_->IsSectorAligned(data.size()));
+ assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
+ }
+
+ size_t bytes_written = 0;
+ IOStatus s = pwrite(file_data_, data, offset, bytes_written);
+
+ if (s.ok()) {
+ if (bytes_written == data.size()) {
+ // For sequential write this would be simple
+ // size extension by data.size()
+ uint64_t write_end = offset + bytes_written;
+ if (write_end >= next_write_offset_) {
+ next_write_offset_ = write_end;
+ }
+ } else {
+ s = IOStatus::IOError("Failed to write all of the requested data: " +
+ file_data_->GetName());
+ }
+ }
+ return s;
+}
+
+inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) {
+ // It is tempting to check for the size for sector alignment
+ // but truncation may come at the end and there is not a requirement
+ // for this to be sector aligned so long as we do not attempt to write
+ // after that. The interface docs state that the behavior is undefined
+ // in that case.
+ IOStatus s =
+ ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size);
+
+ if (s.ok()) {
+ next_write_offset_ = size;
+ }
+ return s;
+}
+
+inline IOStatus WinWritableImpl::CloseImpl() {
+ IOStatus s;
+
+ auto hFile = file_data_->GetFileHandle();
+ assert(INVALID_HANDLE_VALUE != hFile);
+
+ if (!::FlushFileBuffers(hFile)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "FlushFileBuffers failed at Close() for: " + file_data_->GetName(),
+ lastError);
+ }
+
+ if (!file_data_->CloseFile() && s.ok()) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "CloseHandle failed for: " + file_data_->GetName(), lastError);
+ }
+ return s;
+}
+
+inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+ if (!::FlushFileBuffers(file_data_->GetFileHandle())) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(),
+ lastError);
+ }
+ return s;
+}
+
+inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
+ IOStatus status;
+ TEST_KILL_RANDOM("WinWritableFile::Allocate");
+
+ // Make sure that we reserve an aligned amount of space
+ // since the reservation block size is driven outside so we want
+ // to check if we are ok with reservation here
+ size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len),
+ static_cast<size_t>(alignment_));
+ // Nothing to do
+ if (spaceToReserve <= reservedsize_) {
+ return status;
+ }
+
+ IOSTATS_TIMER_GUARD(allocate_nanos);
+ status = PreallocateInternal(spaceToReserve);
+ if (status.ok()) {
+ reservedsize_ = spaceToReserve;
+ }
+ return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// WinWritableFile
+
+WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
+ size_t alignment, size_t /* capacity */,
+ const FileOptions& options)
+ : WinFileData(fname, hFile, options.use_direct_writes),
+ WinWritableImpl(this, alignment),
+ FSWritableFile(options) {
+ assert(!options.use_mmap_writes);
+}
+
+WinWritableFile::~WinWritableFile() {}
+
+// Indicates if the class makes use of direct I/O
+bool WinWritableFile::use_direct_io() const {
+ return WinFileData::use_direct_io();
+}
+
+size_t WinWritableFile::GetRequiredBufferAlignment() const {
+ return static_cast<size_t>(GetAlignment());
+}
+
+IOStatus WinWritableFile::Append(const Slice& data,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return AppendImpl(data);
+}
+
+IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return PositionedAppendImpl(data, offset);
+}
+
+// Need to implement this so the file is truncated correctly
+// when buffered and unbuffered mode
+IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return TruncateImpl(size);
+}
+
+IOStatus WinWritableFile::Close(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return CloseImpl();
+}
+
+// write out the cached data to the OS cache
+// This is now taken care of the WritableFileWriter
+IOStatus WinWritableFile::Flush(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) {
+ return SyncImpl(options, dbg);
+}
+
+IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
+ return SyncImpl(options, dbg);
+}
+
+bool WinWritableFile::IsSyncThreadSafe() const { return true; }
+
+uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return GetFileNextWriteOffset();
+}
+
+IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return AllocateImpl(offset, len);
+}
+
+size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
+}
+
+/////////////////////////////////////////////////////////////////////////
+/// WinRandomRWFile
+
+WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
+ size_t alignment, const FileOptions& options)
+ : WinFileData(fname, hFile,
+ options.use_direct_reads && options.use_direct_writes),
+ WinRandomAccessImpl(this, alignment, options),
+ WinWritableImpl(this, alignment) {}
+
+bool WinRandomRWFile::use_direct_io() const {
+ return WinFileData::use_direct_io();
+}
+
+size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
+ assert(WinRandomAccessImpl::GetAlignment() ==
+ WinWritableImpl::GetAlignment());
+ return static_cast<size_t>(WinRandomAccessImpl::GetAlignment());
+}
+
+IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return PositionedAppendImpl(data, offset);
+}
+
+IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n,
+ const IOOptions& /*options*/, Slice* result,
+ char* scratch, IODebugContext* /*dbg*/) const {
+ return ReadImpl(offset, n, result, scratch);
+}
+
+IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) {
+ return SyncImpl(options, dbg);
+}
+
+IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return CloseImpl();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// WinMemoryMappedBufer
+WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
+ BOOL ret
+#if defined(_MSC_VER)
+ = FALSE;
+#else
+ __attribute__((__unused__));
+#endif
+ if (base_ != nullptr) {
+ ret = ::UnmapViewOfFile(base_);
+ assert(ret);
+ base_ = nullptr;
+ }
+ if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
+ ret = ::CloseHandle(map_handle_);
+ assert(ret);
+ map_handle_ = NULL;
+ }
+ if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
+ ret = ::CloseHandle(file_handle_);
+ assert(ret);
+ file_handle_ = NULL;
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// WinDirectory
+
+IOStatus WinDirectory::Fsync(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+IOStatus WinDirectory::Close(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus s = IOStatus::OK();
+ BOOL ret __attribute__((__unused__));
+ if (handle_ != INVALID_HANDLE_VALUE) {
+ ret = ::CloseHandle(handle_);
+ if (!ret) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("Directory closes failed for : " + GetName(),
+ lastError);
+ }
+ handle_ = NULL;
+ }
+ return s;
+}
+
+size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(handle_, id, max_size);
+}
+//////////////////////////////////////////////////////////////////////////
+/// WinFileLock
+
+WinFileLock::~WinFileLock() {
+ BOOL ret __attribute__((__unused__));
+ ret = ::CloseHandle(hFile_);
+ assert(ret);
+}
+
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/io_win.h b/src/rocksdb/port/win/io_win.h
new file mode 100644
index 000000000..a4fee8346
--- /dev/null
+++ b/src/rocksdb/port/win/io_win.h
@@ -0,0 +1,508 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+#include <windows.h>
+
+#include <mutex>
+#include <string>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+#include "util/aligned_buffer.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+std::string GetWindowsErrSz(DWORD err);
+
+inline IOStatus IOErrorFromWindowsError(const std::string& context, DWORD err) {
+ return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL))
+ ? IOStatus::NoSpace(context, GetWindowsErrSz(err))
+ : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND))
+ ? IOStatus::PathNotFound(context, GetWindowsErrSz(err))
+ : IOStatus::IOError(context, GetWindowsErrSz(err));
+}
+
+inline IOStatus IOErrorFromLastWindowsError(const std::string& context) {
+ return IOErrorFromWindowsError(context, GetLastError());
+}
+
+inline IOStatus IOError(const std::string& context, int err_number) {
+ return (err_number == ENOSPC)
+ ? IOStatus::NoSpace(context, errnoStr(err_number).c_str())
+ : (err_number == ENOENT)
+ ? IOStatus::PathNotFound(context, errnoStr(err_number).c_str())
+ : IOStatus::IOError(context, errnoStr(err_number).c_str());
+}
+
+class WinFileData;
+
+IOStatus pwrite(const WinFileData* file_data, const Slice& data,
+ uint64_t offset, size_t& bytes_written);
+
+IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
+ uint64_t offset, size_t& bytes_read);
+
+IOStatus fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size);
+
+IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize);
+
+size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size);
+
+class WinFileData {
+ protected:
+ const std::string filename_;
+ HANDLE hFile_;
+ // If true, the I/O issued would be direct I/O which the buffer
+ // will need to be aligned (not sure there is a guarantee that the buffer
+ // passed in is aligned).
+ const bool use_direct_io_;
+ const size_t sector_size_;
+
+ public:
+ // We want this class be usable both for inheritance (prive
+ // or protected) and for containment so __ctor and __dtor public
+ WinFileData(const std::string& filename, HANDLE hFile, bool direct_io);
+
+ virtual ~WinFileData() { this->CloseFile(); }
+
+ bool CloseFile() {
+ bool result = true;
+
+ if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
+ result = ::CloseHandle(hFile_);
+ assert(result);
+ hFile_ = NULL;
+ }
+ return result;
+ }
+
+ const std::string& GetName() const { return filename_; }
+
+ HANDLE GetFileHandle() const { return hFile_; }
+
+ bool use_direct_io() const { return use_direct_io_; }
+
+ size_t GetSectorSize() const { return sector_size_; }
+
+ bool IsSectorAligned(const size_t off) const;
+
+ WinFileData(const WinFileData&) = delete;
+ WinFileData& operator=(const WinFileData&) = delete;
+};
+
+class WinSequentialFile : protected WinFileData, public FSSequentialFile {
+ // Override for behavior change when creating a custom env
+ virtual IOStatus PositionedReadInternal(char* src, size_t numBytes,
+ uint64_t offset,
+ size_t& bytes_read) const;
+
+ public:
+ WinSequentialFile(const std::string& fname, HANDLE f,
+ const FileOptions& options);
+
+ ~WinSequentialFile();
+
+ WinSequentialFile(const WinSequentialFile&) = delete;
+ WinSequentialFile& operator=(const WinSequentialFile&) = delete;
+
+ IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+ char* scratch, IODebugContext* dbg) override;
+ IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) override;
+
+ IOStatus Skip(uint64_t n) override;
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ virtual bool use_direct_io() const override {
+ return WinFileData::use_direct_io();
+ }
+};
+
+// mmap() based random-access
+class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile {
+ HANDLE hMap_;
+
+ const void* mapped_region_;
+ const size_t length_;
+
+ public:
+ // mapped_region_[0,length-1] contains the mmapped contents of the file.
+ WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap,
+ const void* mapped_region, size_t length);
+
+ ~WinMmapReadableFile();
+
+ WinMmapReadableFile(const WinMmapReadableFile&) = delete;
+ WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete;
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+
+ virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+};
+
+// We preallocate and use memcpy to append new
+// data to the file. This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
+class WinMmapFile : private WinFileData, public FSWritableFile {
+ private:
+ HANDLE hMap_;
+
+ const size_t page_size_; // We flush the mapping view in page_size
+ // increments. We may decide if this is a memory
+ // page size or SSD page size
+ const size_t
+ allocation_granularity_; // View must start at such a granularity
+
+ size_t reserved_size_; // Preallocated size
+
+ size_t mapping_size_; // The max size of the mapping object
+ // we want to guess the final file size to minimize the remapping
+ size_t view_size_; // How much memory to map into a view at a time
+
+ char* mapped_begin_; // Must begin at the file offset that is aligned with
+ // allocation_granularity_
+ char* mapped_end_;
+ char* dst_; // Where to write next (in range [mapped_begin_,mapped_end_])
+ char* last_sync_; // Where have we synced up to
+
+ uint64_t file_offset_; // Offset of mapped_begin_ in file
+
+ // Do we have unsynced writes?
+ bool pending_sync_;
+
+ // Can only truncate or reserve to a sector size aligned if
+ // used on files that are opened with Unbuffered I/O
+ IOStatus TruncateFile(uint64_t toSize);
+
+ IOStatus UnmapCurrentRegion();
+
+ IOStatus MapNewRegion(const IOOptions& options, IODebugContext* dbg);
+
+ virtual IOStatus PreallocateInternal(uint64_t spaceToReserve);
+
+ public:
+ WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
+ size_t allocation_granularity, const FileOptions& options);
+
+ ~WinMmapFile();
+
+ WinMmapFile(const WinMmapFile&) = delete;
+ WinMmapFile& operator=(const WinMmapFile&) = delete;
+
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override;
+ IOStatus Append(const Slice& data, const IOOptions& opts,
+ const DataVerificationInfo& /* verification_info */,
+ IODebugContext* dbg) override {
+ return Append(data, opts, dbg);
+ }
+
+ // Means Close() will properly take care of truncate
+ // and it does not need any additional information
+ IOStatus Truncate(uint64_t size, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+ // Flush only data
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+ /**
+ * Flush data as well as metadata to stable storage.
+ */
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+ /**
+ * Get the size of valid data in the file. This will not match the
+ * size that is returned from the filesystem because we use mmap
+ * to extend file by map_size every time.
+ */
+ uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+};
+
+class WinRandomAccessImpl {
+ protected:
+ WinFileData* file_base_;
+ size_t alignment_;
+
+ // Override for behavior change when creating a custom env
+ virtual IOStatus PositionedReadInternal(char* src, size_t numBytes,
+ uint64_t offset,
+ size_t& bytes_read) const;
+
+ WinRandomAccessImpl(WinFileData* file_base, size_t alignment,
+ const FileOptions& options);
+
+ virtual ~WinRandomAccessImpl() {}
+
+ IOStatus ReadImpl(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const;
+
+ size_t GetAlignment() const { return alignment_; }
+
+ public:
+ WinRandomAccessImpl(const WinRandomAccessImpl&) = delete;
+ WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete;
+};
+
+// pread() based random-access
+class WinRandomAccessFile
+ : private WinFileData,
+ protected WinRandomAccessImpl, // Want to be able to override
+ // PositionedReadInternal
+ public FSRandomAccessFile {
+ public:
+ WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
+ const FileOptions& options);
+
+ ~WinRandomAccessFile();
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+
+ virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+
+ virtual bool use_direct_io() const override {
+ return WinFileData::use_direct_io();
+ }
+
+ IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ virtual size_t GetRequiredBufferAlignment() const override;
+};
+
+// This is a sequential write class. It has been mimicked (as others) after
+// the original Posix class. We add support for unbuffered I/O on windows as
+// well
+// we utilize the original buffer as an alignment buffer to write directly to
+// file with no buffering.
+// No buffering requires that the provided buffer is aligned to the physical
+// sector size (SSD page size) and
+// that all SetFilePointer() operations to occur with such an alignment.
+// We thus always write in sector/page size increments to the drive and leave
+// the tail for the next write OR for Close() at which point we pad with zeros.
+// No padding is required for
+// buffered access.
+class WinWritableImpl {
+ protected:
+ WinFileData* file_data_;
+ const uint64_t alignment_;
+ uint64_t
+ next_write_offset_; // Needed because Windows does not support O_APPEND
+ uint64_t reservedsize_; // how far we have reserved space
+
+ virtual IOStatus PreallocateInternal(uint64_t spaceToReserve);
+
+ WinWritableImpl(WinFileData* file_data, size_t alignment);
+
+ ~WinWritableImpl() {}
+
+ uint64_t GetAlignment() const { return alignment_; }
+
+ IOStatus AppendImpl(const Slice& data);
+
+ // Requires that the data is aligned as specified by
+ // GetRequiredBufferAlignment()
+ IOStatus PositionedAppendImpl(const Slice& data, uint64_t offset);
+
+ IOStatus TruncateImpl(uint64_t size);
+
+ IOStatus CloseImpl();
+
+ IOStatus SyncImpl(const IOOptions& options, IODebugContext* dbg);
+
+ uint64_t GetFileNextWriteOffset() {
+ // Double accounting now here with WritableFileWriter
+ // and this size will be wrong when unbuffered access is used
+ // but tests implement their own writable files and do not use
+ // WritableFileWrapper
+ // so we need to squeeze a square peg through
+ // a round hole here.
+ return next_write_offset_;
+ }
+
+ IOStatus AllocateImpl(uint64_t offset, uint64_t len);
+
+ public:
+ WinWritableImpl(const WinWritableImpl&) = delete;
+ WinWritableImpl& operator=(const WinWritableImpl&) = delete;
+};
+
+class WinWritableFile : private WinFileData,
+ protected WinWritableImpl,
+ public FSWritableFile {
+ public:
+ WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
+ size_t capacity, const FileOptions& options);
+
+ ~WinWritableFile();
+
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override;
+ IOStatus Append(const Slice& data, const IOOptions& opts,
+ const DataVerificationInfo& /* verification_info */,
+ IODebugContext* dbg) override {
+ return Append(data, opts, dbg);
+ }
+
+ // Requires that the data is aligned as specified by
+ // GetRequiredBufferAlignment()
+ IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& options,
+ IODebugContext* dbg) override;
+ IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& opts,
+ const DataVerificationInfo& /* verification_info */,
+ IODebugContext* dbg) override {
+ return PositionedAppend(data, offset, opts, dbg);
+ }
+
+ // Need to implement this so the file is truncated correctly
+ // when buffered and unbuffered mode
+ IOStatus Truncate(uint64_t size, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+ // write out the cached data to the OS cache
+ // This is now taken care of the WritableFileWriter
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+ virtual bool IsSyncThreadSafe() const override;
+
+ // Indicates if the class makes use of direct I/O
+ // Use PositionedAppend
+ virtual bool use_direct_io() const override;
+
+ virtual size_t GetRequiredBufferAlignment() const override;
+
+ uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+};
+
+class WinRandomRWFile : private WinFileData,
+ protected WinRandomAccessImpl,
+ protected WinWritableImpl,
+ public FSRandomRWFile {
+ public:
+ WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment,
+ const FileOptions& options);
+
+ ~WinRandomRWFile() {}
+
+ // Indicates if the class makes use of direct I/O
+ // If false you must pass aligned buffer to Write()
+ virtual bool use_direct_io() const override;
+
+ // Use the returned alignment value to allocate aligned
+ // buffer for Write() when use_direct_io() returns true
+ virtual size_t GetRequiredBufferAlignment() const override;
+
+ // Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
+ // Pass aligned buffer when use_direct_io() returns true.
+ IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override;
+
+ // Read up to `n` bytes starting from offset `offset` and store them in
+ // result, provided `scratch` size should be at least `n`.
+ // Returns Status::OK() on success.
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ return Sync(options, dbg);
+ }
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+class WinMemoryMappedBuffer : public MemoryMappedFileBuffer {
+ private:
+ HANDLE file_handle_;
+ HANDLE map_handle_;
+
+ public:
+ WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base,
+ size_t size)
+ : MemoryMappedFileBuffer(base, size),
+ file_handle_(file_handle),
+ map_handle_(map_handle) {}
+ ~WinMemoryMappedBuffer() override;
+};
+
+class WinDirectory : public FSDirectory {
+ const std::string filename_;
+ HANDLE handle_;
+
+ public:
+ explicit WinDirectory(const std::string& filename, HANDLE h) noexcept
+ : filename_(filename), handle_(h) {
+ assert(handle_ != INVALID_HANDLE_VALUE);
+ }
+ ~WinDirectory() {
+ if (handle_ != NULL) {
+ IOStatus s = WinDirectory::Close(IOOptions(), nullptr);
+ s.PermitUncheckedError();
+ }
+ }
+ const std::string& GetName() const { return filename_; }
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+ size_t GetUniqueId(char* id, size_t max_size) const override;
+};
+
+class WinFileLock : public FileLock {
+ public:
+ explicit WinFileLock(HANDLE hFile) : hFile_(hFile) {
+ assert(hFile != NULL);
+ assert(hFile != INVALID_HANDLE_VALUE);
+ }
+
+ ~WinFileLock();
+
+ private:
+ HANDLE hFile_;
+};
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/port_win.cc b/src/rocksdb/port/win/port_win.cc
new file mode 100644
index 000000000..37e8f655c
--- /dev/null
+++ b/src/rocksdb/port/win/port_win.cc
@@ -0,0 +1,303 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/port_win.h"
+
+#include <assert.h>
+#include <io.h>
+#include <rpc.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <chrono>
+#include <cstdlib>
+#include <exception>
+#include <memory>
+
+#include "port/port_dirent.h"
+#include "port/sys_time.h"
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+// utf8 <-> utf16
+#include <codecvt>
+#include <locale>
+#include <string>
+#endif
+
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const bool kDefaultToAdaptiveMutex = false;
+
+namespace port {
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+std::string utf16_to_utf8(const std::wstring& utf16) {
+ std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> convert;
+ return convert.to_bytes(utf16);
+}
+
+std::wstring utf8_to_utf16(const std::string& utf8) {
+ std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+ return converter.from_bytes(utf8);
+}
+#endif
+
+void GetTimeOfDay(TimeVal* tv, struct timezone* /* tz */) {
+ std::chrono::microseconds usNow(
+ std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::system_clock::now().time_since_epoch()));
+
+ std::chrono::seconds secNow(
+ std::chrono::duration_cast<std::chrono::seconds>(usNow));
+
+ tv->tv_sec = static_cast<long>(secNow.count());
+ tv->tv_usec = static_cast<long>(
+ usNow.count() -
+ std::chrono::duration_cast<std::chrono::microseconds>(secNow).count());
+}
+
+Mutex::~Mutex() {}
+
+CondVar::~CondVar() {}
+
+void CondVar::Wait() {
+ // Caller must ensure that mutex is held prior to calling this method
+ std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
+#ifndef NDEBUG
+ mu_->locked_ = false;
+#endif
+ cv_.wait(lk);
+#ifndef NDEBUG
+ mu_->locked_ = true;
+#endif
+ // Release ownership of the lock as we don't want it to be unlocked when
+ // it goes out of scope (as we adopted the lock and didn't lock it ourselves)
+ lk.release();
+}
+
+bool CondVar::TimedWait(uint64_t abs_time_us) {
+ // MSVC++ library implements wait_until in terms of wait_for so
+ // we need to convert absolute wait into relative wait.
+ std::chrono::microseconds usAbsTime(abs_time_us);
+
+ std::chrono::microseconds usNow(
+ std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::system_clock::now().time_since_epoch()));
+ std::chrono::microseconds relTimeUs = (usAbsTime > usNow)
+ ? (usAbsTime - usNow)
+ : std::chrono::microseconds::zero();
+
+ // Caller must ensure that mutex is held prior to calling this method
+ std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
+
+ // Work around https://github.com/microsoft/STL/issues/369
+#if defined(_MSC_VER) && \
+ (!defined(_MSVC_STL_UPDATE) || _MSVC_STL_UPDATE < 202008L)
+ if (relTimeUs == std::chrono::microseconds::zero()) {
+ lk.unlock();
+ lk.lock();
+ }
+#endif
+#ifndef NDEBUG
+ mu_->locked_ = false;
+#endif
+ std::cv_status cvStatus = cv_.wait_for(lk, relTimeUs);
+#ifndef NDEBUG
+ mu_->locked_ = true;
+#endif
+ // Release ownership of the lock as we don't want it to be unlocked when
+ // it goes out of scope (as we adopted the lock and didn't lock it ourselves)
+ lk.release();
+
+ if (cvStatus == std::cv_status::timeout) {
+ return true;
+ }
+
+ return false;
+}
+
+void CondVar::Signal() { cv_.notify_one(); }
+
+void CondVar::SignalAll() { cv_.notify_all(); }
+
+int PhysicalCoreID() { return GetCurrentProcessorNumber(); }
+
+void InitOnce(OnceType* once, void (*initializer)()) {
+ std::call_once(once->flag_, initializer);
+}
+
+// Private structure, exposed only by pointer
+struct DIR {
+ HANDLE handle_;
+ bool firstread_;
+ RX_WIN32_FIND_DATA data_;
+ dirent entry_;
+
+ DIR() : handle_(INVALID_HANDLE_VALUE), firstread_(true) {}
+
+ DIR(const DIR&) = delete;
+ DIR& operator=(const DIR&) = delete;
+
+ ~DIR() {
+ if (INVALID_HANDLE_VALUE != handle_) {
+ ::FindClose(handle_);
+ }
+ }
+};
+
+DIR* opendir(const char* name) {
+ if (!name || *name == 0) {
+ errno = ENOENT;
+ return nullptr;
+ }
+
+ std::string pattern(name);
+ pattern.append("\\").append("*");
+
+ std::unique_ptr<DIR> dir(new DIR);
+
+ dir->handle_ =
+ RX_FindFirstFileEx(RX_FN(pattern).c_str(),
+ FindExInfoBasic, // Do not want alternative name
+ &dir->data_, FindExSearchNameMatch,
+ NULL, // lpSearchFilter
+ 0);
+
+ if (dir->handle_ == INVALID_HANDLE_VALUE) {
+ return nullptr;
+ }
+
+ RX_FILESTRING x(dir->data_.cFileName, RX_FNLEN(dir->data_.cFileName));
+ strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), FN_TO_RX(x).c_str());
+
+ return dir.release();
+}
+
+struct dirent* readdir(DIR* dirp) {
+ if (!dirp || dirp->handle_ == INVALID_HANDLE_VALUE) {
+ errno = EBADF;
+ return nullptr;
+ }
+
+ if (dirp->firstread_) {
+ dirp->firstread_ = false;
+ return &dirp->entry_;
+ }
+
+ auto ret = RX_FindNextFile(dirp->handle_, &dirp->data_);
+
+ if (ret == 0) {
+ return nullptr;
+ }
+
+ RX_FILESTRING x(dirp->data_.cFileName, RX_FNLEN(dirp->data_.cFileName));
+ strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name),
+ FN_TO_RX(x).c_str());
+
+ return &dirp->entry_;
+}
+
+int closedir(DIR* dirp) {
+ delete dirp;
+ return 0;
+}
+
+int truncate(const char* path, int64_t length) {
+ if (path == nullptr) {
+ errno = EFAULT;
+ return -1;
+ }
+ return ROCKSDB_NAMESPACE::port::Truncate(path, length);
+}
+
+int Truncate(std::string path, int64_t len) {
+ if (len < 0) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ HANDLE hFile =
+ RX_CreateFile(RX_FN(path).c_str(), GENERIC_READ | GENERIC_WRITE,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL, // Security attrs
+ OPEN_EXISTING, // Truncate existing file only
+ FILE_ATTRIBUTE_NORMAL, NULL);
+
+ if (INVALID_HANDLE_VALUE == hFile) {
+ auto lastError = GetLastError();
+ if (lastError == ERROR_FILE_NOT_FOUND) {
+ errno = ENOENT;
+ } else if (lastError == ERROR_ACCESS_DENIED) {
+ errno = EACCES;
+ } else {
+ errno = EIO;
+ }
+ return -1;
+ }
+
+ int result = 0;
+ FILE_END_OF_FILE_INFO end_of_file;
+ end_of_file.EndOfFile.QuadPart = len;
+
+ if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
+ sizeof(FILE_END_OF_FILE_INFO))) {
+ errno = EIO;
+ result = -1;
+ }
+
+ CloseHandle(hFile);
+ return result;
+}
+
+void Crash(const std::string& srcfile, int srcline) {
+ fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
+ fflush(stdout);
+ abort();
+}
+
+int GetMaxOpenFiles() { return -1; }
+
+// Assume 4KB page size
+const size_t kPageSize = 4U * 1024U;
+
+void SetCpuPriority(ThreadId id, CpuPriority priority) {
+ // Not supported
+ (void)id;
+ (void)priority;
+}
+
+int64_t GetProcessID() { return GetCurrentProcessId(); }
+
+bool GenerateRfcUuid(std::string* output) {
+ UUID uuid;
+ UuidCreateSequential(&uuid);
+
+ RPC_CSTR rpc_str;
+ auto status = UuidToStringA(&uuid, &rpc_str);
+ if (status != RPC_S_OK) {
+ return false;
+ }
+
+ // rpc_str is nul-terminated
+ *output = reinterpret_cast<char*>(rpc_str);
+
+ status = RpcStringFreeA(&rpc_str);
+ assert(status == RPC_S_OK);
+
+ return true;
+}
+
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/port_win.h b/src/rocksdb/port/win/port_win.h
new file mode 100644
index 000000000..989b5620b
--- /dev/null
+++ b/src/rocksdb/port/win/port_win.h
@@ -0,0 +1,378 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#pragma once
+
+// Always want minimum headers
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+
+#include <windows.h>
+#include <string>
+#include <thread>
+#include <string.h>
+#include <mutex>
+#include <limits>
+#include <condition_variable>
+#include <malloc.h>
+#include <intrin.h>
+#include <process.h>
+
+#include <stdint.h>
+
+#include "port/win/win_thread.h"
+
+#include "rocksdb/options.h"
+
+#undef min
+#undef max
+#undef DeleteFile
+#undef GetCurrentTime
+
+#ifndef strcasecmp
+#define strcasecmp _stricmp
+#endif
+
+#undef GetCurrentTime
+#undef DeleteFile
+
+#ifndef _SSIZE_T_DEFINED
+using ssize_t = SSIZE_T;
+#endif
+
+// size_t printf formatting named in the manner of C99 standard formatting
+// strings such as PRIu64
+// in fact, we could use that one
+#ifndef ROCKSDB_PRIszt
+#define ROCKSDB_PRIszt "Iu"
+#endif
+
+#ifdef _MSC_VER
+#define __attribute__(A)
+
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+#define PREFETCH(addr, rw, locality)
+
+extern const bool kDefaultToAdaptiveMutex;
+
+namespace port {
+
+// "Windows is designed to run on little-endian computer architectures."
+// https://docs.microsoft.com/en-us/windows/win32/sysinfo/registry-value-types
+constexpr bool kLittleEndian = true;
+#undef PLATFORM_IS_LITTLE_ENDIAN
+
+class CondVar;
+
+class Mutex {
+ public:
+ static const char* kName() { return "std::mutex"; }
+
+ explicit Mutex(bool IGNORED_adaptive = kDefaultToAdaptiveMutex)
+#ifndef NDEBUG
+ : locked_(false)
+#endif
+ {
+ (void)IGNORED_adaptive;
+ }
+
+ ~Mutex();
+
+ void Lock() {
+ mutex_.lock();
+#ifndef NDEBUG
+ locked_ = true;
+#endif
+ }
+
+ void Unlock() {
+#ifndef NDEBUG
+ locked_ = false;
+#endif
+ mutex_.unlock();
+ }
+
+ bool TryLock() {
+ bool ret = mutex_.try_lock();
+#ifndef NDEBUG
+ if (ret) {
+ locked_ = true;
+ }
+#endif
+ return ret;
+ }
+
+ // this will assert if the mutex is not locked
+ // it does NOT verify that mutex is held by a calling thread
+ void AssertHeld() {
+#ifndef NDEBUG
+ assert(locked_);
+#endif
+ }
+
+ // Also implement std Lockable
+ inline void lock() { Lock(); }
+ inline void unlock() { Unlock(); }
+ inline bool try_lock() { return TryLock(); }
+
+ // Mutex is move only with lock ownership transfer
+ Mutex(const Mutex&) = delete;
+ void operator=(const Mutex&) = delete;
+
+ private:
+ friend class CondVar;
+
+ std::mutex& getLock() { return mutex_; }
+
+ std::mutex mutex_;
+#ifndef NDEBUG
+ bool locked_;
+#endif
+};
+
+class RWMutex {
+ public:
+ RWMutex() { InitializeSRWLock(&srwLock_); }
+ // No copying allowed
+ RWMutex(const RWMutex&) = delete;
+ void operator=(const RWMutex&) = delete;
+
+ void ReadLock() { AcquireSRWLockShared(&srwLock_); }
+
+ void WriteLock() { AcquireSRWLockExclusive(&srwLock_); }
+
+ void ReadUnlock() { ReleaseSRWLockShared(&srwLock_); }
+
+ void WriteUnlock() { ReleaseSRWLockExclusive(&srwLock_); }
+
+ // Empty as in POSIX
+ void AssertHeld() {}
+
+ private:
+ SRWLOCK srwLock_;
+};
+
+class CondVar {
+ public:
+ explicit CondVar(Mutex* mu) : mu_(mu) {}
+
+ ~CondVar();
+ void Wait();
+ bool TimedWait(uint64_t expiration_time);
+ void Signal();
+ void SignalAll();
+
+ // Condition var is not copy/move constructible
+ CondVar(const CondVar&) = delete;
+ CondVar& operator=(const CondVar&) = delete;
+
+ CondVar(CondVar&&) = delete;
+ CondVar& operator=(CondVar&&) = delete;
+
+ private:
+ std::condition_variable cv_;
+ Mutex* mu_;
+};
+
+#ifdef _POSIX_THREADS
+using Thread = std::thread;
+#else
+// Wrapper around the platform efficient
+// or otherwise preferrable implementation
+using Thread = WindowsThread;
+#endif
+
+// OnceInit type helps emulate
+// Posix semantics with initialization
+// adopted in the project
+struct OnceType {
+ struct Init {};
+
+ OnceType() {}
+ OnceType(const Init&) {}
+ OnceType(const OnceType&) = delete;
+ OnceType& operator=(const OnceType&) = delete;
+
+ std::once_flag flag_;
+};
+
+#define LEVELDB_ONCE_INIT port::OnceType::Init()
+extern void InitOnce(OnceType* once, void (*initializer)());
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 64U
+#endif
+
+#ifdef ROCKSDB_JEMALLOC
+// Separate inlines so they can be replaced if needed
+void* jemalloc_aligned_alloc(size_t size, size_t alignment) noexcept;
+void jemalloc_aligned_free(void* p) noexcept;
+#endif
+
+inline void* cacheline_aligned_alloc(size_t size) {
+#ifdef ROCKSDB_JEMALLOC
+ return jemalloc_aligned_alloc(size, CACHE_LINE_SIZE);
+#else
+ return _aligned_malloc(size, CACHE_LINE_SIZE);
+#endif
+}
+
+inline void cacheline_aligned_free(void* memblock) {
+#ifdef ROCKSDB_JEMALLOC
+ jemalloc_aligned_free(memblock);
+#else
+ _aligned_free(memblock);
+#endif
+}
+
+extern const size_t kPageSize;
+
+// Part of C++11
+#define ALIGN_AS(n) alignas(n)
+
+static inline void AsmVolatilePause() {
+#if defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM)
+ YieldProcessor();
+#endif
+ // it would be nice to get "wfe" on ARM here
+}
+
+extern int PhysicalCoreID();
+
+// For Thread Local Storage abstraction
+using pthread_key_t = DWORD;
+
+inline int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) {
+ // Not used
+ (void)destructor;
+
+ pthread_key_t k = TlsAlloc();
+ if (TLS_OUT_OF_INDEXES == k) {
+ return ENOMEM;
+ }
+
+ *key = k;
+ return 0;
+}
+
+inline int pthread_key_delete(pthread_key_t key) {
+ if (!TlsFree(key)) {
+ return EINVAL;
+ }
+ return 0;
+}
+
+inline int pthread_setspecific(pthread_key_t key, const void* value) {
+ if (!TlsSetValue(key, const_cast<void*>(value))) {
+ return ENOMEM;
+ }
+ return 0;
+}
+
+inline void* pthread_getspecific(pthread_key_t key) {
+ void* result = TlsGetValue(key);
+ if (!result) {
+ if (GetLastError() != ERROR_SUCCESS) {
+ errno = EINVAL;
+ } else {
+ errno = NOERROR;
+ }
+ }
+ return result;
+}
+
+// UNIX equiv although errno numbers will be off
+// using C-runtime to implement. Note, this does not
+// feel space with zeros in case the file is extended.
+int truncate(const char* path, int64_t length);
+int Truncate(std::string path, int64_t length);
+void Crash(const std::string& srcfile, int srcline);
+extern int GetMaxOpenFiles();
+std::string utf16_to_utf8(const std::wstring& utf16);
+std::wstring utf8_to_utf16(const std::string& utf8);
+
+using ThreadId = int;
+
+extern void SetCpuPriority(ThreadId id, CpuPriority priority);
+
+int64_t GetProcessID();
+
+// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns
+// true on success or false on failure.
+bool GenerateRfcUuid(std::string* output);
+
+} // namespace port
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+
+#define RX_FILESTRING std::wstring
+#define RX_FN(a) ROCKSDB_NAMESPACE::port::utf8_to_utf16(a)
+#define FN_TO_RX(a) ROCKSDB_NAMESPACE::port::utf16_to_utf8(a)
+#define RX_FNCMP(a, b) ::wcscmp(a, RX_FN(b).c_str())
+#define RX_FNLEN(a) ::wcslen(a)
+
+#define RX_DeleteFile DeleteFileW
+#define RX_CreateFile CreateFileW
+#define RX_CreateFileMapping CreateFileMappingW
+#define RX_GetFileAttributesEx GetFileAttributesExW
+#define RX_FindFirstFileEx FindFirstFileExW
+#define RX_FindNextFile FindNextFileW
+#define RX_WIN32_FIND_DATA WIN32_FIND_DATAW
+#define RX_CreateDirectory CreateDirectoryW
+#define RX_RemoveDirectory RemoveDirectoryW
+#define RX_GetFileAttributesEx GetFileAttributesExW
+#define RX_MoveFileEx MoveFileExW
+#define RX_CreateHardLink CreateHardLinkW
+#define RX_PathIsRelative PathIsRelativeW
+#define RX_GetCurrentDirectory GetCurrentDirectoryW
+#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExW
+#define RX_PathIsDirectory PathIsDirectoryW
+
+#else
+
+#define RX_FILESTRING std::string
+#define RX_FN(a) a
+#define FN_TO_RX(a) a
+#define RX_FNCMP(a, b) strcmp(a, b)
+#define RX_FNLEN(a) strlen(a)
+
+#define RX_DeleteFile DeleteFileA
+#define RX_CreateFile CreateFileA
+#define RX_CreateFileMapping CreateFileMappingA
+#define RX_GetFileAttributesEx GetFileAttributesExA
+#define RX_FindFirstFileEx FindFirstFileExA
+#define RX_CreateDirectory CreateDirectoryA
+#define RX_FindNextFile FindNextFileA
+#define RX_WIN32_FIND_DATA WIN32_FIND_DATAA
+#define RX_CreateDirectory CreateDirectoryA
+#define RX_RemoveDirectory RemoveDirectoryA
+#define RX_GetFileAttributesEx GetFileAttributesExA
+#define RX_MoveFileEx MoveFileExA
+#define RX_CreateHardLink CreateHardLinkA
+#define RX_PathIsRelative PathIsRelativeA
+#define RX_GetCurrentDirectory GetCurrentDirectoryA
+#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExA
+#define RX_PathIsDirectory PathIsDirectoryA
+
+#endif
+
+using port::pthread_getspecific;
+using port::pthread_key_create;
+using port::pthread_key_delete;
+using port::pthread_key_t;
+using port::pthread_setspecific;
+using port::truncate;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/win_jemalloc.cc b/src/rocksdb/port/win/win_jemalloc.cc
new file mode 100644
index 000000000..cf38f55b7
--- /dev/null
+++ b/src/rocksdb/port/win/win_jemalloc.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#ifndef ROCKSDB_JEMALLOC
+#error This file can only be part of jemalloc aware build
+#endif
+
+#include <stdexcept>
+
+#include "jemalloc/jemalloc.h"
+#include "port/win/port_win.h"
+
+#if defined(ZSTD) && defined(ZSTD_STATIC_LINKING_ONLY)
+#include <zstd.h>
+#if (ZSTD_VERSION_NUMBER >= 500)
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+void* JemallocAllocateForZSTD(void* /* opaque */, size_t size) {
+ return je_malloc(size);
+}
+void JemallocDeallocateForZSTD(void* /* opaque */, void* address) {
+ je_free(address);
+}
+ZSTD_customMem GetJeZstdAllocationOverrides() {
+ return {JemallocAllocateForZSTD, JemallocDeallocateForZSTD, nullptr};
+}
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+#endif // (ZSTD_VERSION_NUMBER >= 500)
+#endif // defined(ZSTD) defined(ZSTD_STATIC_LINKING_ONLY)
+
+// Global operators to be replaced by a linker when this file is
+// a part of the build
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+void* jemalloc_aligned_alloc(size_t size, size_t alignment) noexcept {
+ return je_aligned_alloc(alignment, size);
+}
+void jemalloc_aligned_free(void* p) noexcept { je_free(p); }
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+void* operator new(size_t size) {
+ void* p = je_malloc(size);
+ if (!p) {
+ throw std::bad_alloc();
+ }
+ return p;
+}
+
+void* operator new[](size_t size) {
+ void* p = je_malloc(size);
+ if (!p) {
+ throw std::bad_alloc();
+ }
+ return p;
+}
+
+void operator delete(void* p) {
+ if (p) {
+ je_free(p);
+ }
+}
+
+void operator delete[](void* p) {
+ if (p) {
+ je_free(p);
+ }
+}
+
+#endif
diff --git a/src/rocksdb/port/win/win_logger.cc b/src/rocksdb/port/win/win_logger.cc
new file mode 100644
index 000000000..072ea419a
--- /dev/null
+++ b/src/rocksdb/port/win/win_logger.cc
@@ -0,0 +1,192 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#if defined(OS_WIN)
+
+#include "port/win/win_logger.h"
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <algorithm>
+#include <atomic>
+
+#include "monitoring/iostats_context_imp.h"
+#include "port/sys_time.h"
+#include "port/win/env_win.h"
+#include "port/win/io_win.h"
+#include "rocksdb/env.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace port {
+
+WinLogger::WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file,
+ const InfoLogLevel log_level)
+ : Logger(log_level),
+ file_(file),
+ gettid_(gettid),
+ log_size_(0),
+ last_flush_micros_(0),
+ clock_(clock),
+ flush_pending_(false) {
+ assert(file_ != NULL);
+ assert(file_ != INVALID_HANDLE_VALUE);
+}
+
+void WinLogger::DebugWriter(const char* str, int len) {
+ assert(file_ != INVALID_HANDLE_VALUE);
+ DWORD bytesWritten = 0;
+ BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL);
+ if (ret == FALSE) {
+ std::string errSz = GetWindowsErrSz(GetLastError());
+ fprintf(stderr, "%s", errSz.c_str());
+ }
+}
+
+WinLogger::~WinLogger() { CloseInternal().PermitUncheckedError(); }
+
+Status WinLogger::CloseImpl() { return CloseInternal(); }
+
+Status WinLogger::CloseInternal() {
+ Status s;
+ if (INVALID_HANDLE_VALUE != file_) {
+ BOOL ret = FlushFileBuffers(file_);
+ if (ret == 0) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError);
+ }
+ ret = CloseHandle(file_);
+ // On error the return value is zero
+ if (ret == 0 && s.ok()) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError);
+ }
+ file_ = INVALID_HANDLE_VALUE;
+ closed_ = true;
+ }
+ return s;
+}
+
+void WinLogger::Flush() {
+ assert(file_ != INVALID_HANDLE_VALUE);
+ if (flush_pending_) {
+ flush_pending_ = false;
+ // With Windows API writes go to OS buffers directly so no fflush needed
+ // unlike with C runtime API. We don't flush all the way to disk
+ // for perf reasons.
+ }
+
+ last_flush_micros_ = clock_->NowMicros();
+}
+
+void WinLogger::Logv(const char* format, va_list ap) {
+ IOSTATS_TIMER_GUARD(logger_nanos);
+ assert(file_ != INVALID_HANDLE_VALUE);
+
+ const uint64_t thread_id = (*gettid_)();
+
+ // We try twice: the first time with a fixed-size stack allocated buffer,
+ // and the second time with a much larger dynamically allocated buffer.
+ char buffer[500];
+ std::unique_ptr<char[]> largeBuffer;
+ for (int iter = 0; iter < 2; ++iter) {
+ char* base;
+ int bufsize;
+ if (iter == 0) {
+ bufsize = sizeof(buffer);
+ base = buffer;
+ } else {
+ bufsize = 30000;
+ largeBuffer.reset(new char[bufsize]);
+ base = largeBuffer.get();
+ }
+
+ char* p = base;
+ char* limit = base + bufsize;
+
+ port::TimeVal now_tv;
+ port::GetTimeOfDay(&now_tv, nullptr);
+ const time_t seconds = now_tv.tv_sec;
+ struct tm t;
+ localtime_s(&t, &seconds);
+ p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+ t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
+ t.tm_min, t.tm_sec, static_cast<int>(now_tv.tv_usec),
+ static_cast<long long unsigned int>(thread_id));
+
+ // Print the message
+ if (p < limit) {
+ va_list backup_ap;
+ va_copy(backup_ap, ap);
+ int done = vsnprintf(p, limit - p, format, backup_ap);
+ if (done > 0) {
+ p += done;
+ } else {
+ continue;
+ }
+ va_end(backup_ap);
+ }
+
+ // Truncate to available space if necessary
+ if (p >= limit) {
+ if (iter == 0) {
+ continue; // Try again with larger buffer
+ } else {
+ p = limit - 1;
+ }
+ }
+
+ // Add newline if necessary
+ if (p == base || p[-1] != '\n') {
+ *p++ = '\n';
+ }
+
+ assert(p <= limit);
+ const size_t write_size = p - base;
+
+ DWORD bytesWritten = 0;
+ BOOL ret = WriteFile(file_, base, static_cast<DWORD>(write_size),
+ &bytesWritten, NULL);
+ if (ret == FALSE) {
+ std::string errSz = GetWindowsErrSz(GetLastError());
+ fprintf(stderr, "%s", errSz.c_str());
+ }
+
+ flush_pending_ = true;
+ assert((bytesWritten == write_size) || (ret == FALSE));
+ if (bytesWritten > 0) {
+ log_size_ += write_size;
+ }
+
+ uint64_t now_micros =
+ static_cast<uint64_t>(now_tv.tv_sec) * 1000000 + now_tv.tv_usec;
+ if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+ flush_pending_ = false;
+ // With Windows API writes go to OS buffers directly so no fflush needed
+ // unlike with C runtime API. We don't flush all the way to disk
+ // for perf reasons.
+ last_flush_micros_ = now_micros;
+ }
+ break;
+ }
+}
+
+size_t WinLogger::GetLogFileSize() const { return log_size_; }
+
+} // namespace port
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/win_logger.h b/src/rocksdb/port/win/win_logger.h
new file mode 100644
index 000000000..1ca4610e9
--- /dev/null
+++ b/src/rocksdb/port/win/win_logger.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#pragma once
+
+#include <stdint.h>
+#include <windows.h>
+
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+
+namespace port {
+class WinLogger : public ROCKSDB_NAMESPACE::Logger {
+ public:
+ WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file,
+ const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL);
+
+ virtual ~WinLogger();
+
+ WinLogger(const WinLogger&) = delete;
+
+ WinLogger& operator=(const WinLogger&) = delete;
+
+ void Flush() override;
+
+ using ROCKSDB_NAMESPACE::Logger::Logv;
+ void Logv(const char* format, va_list ap) override;
+
+ size_t GetLogFileSize() const override;
+
+ void DebugWriter(const char* str, int len);
+
+ protected:
+ Status CloseImpl() override;
+
+ private:
+ HANDLE file_;
+ uint64_t (*gettid_)(); // Return the thread id for the current thread
+ std::atomic_size_t log_size_;
+ std::atomic_uint_fast64_t last_flush_micros_;
+ SystemClock* clock_;
+ bool flush_pending_;
+
+ Status CloseInternal();
+
+ const static uint64_t flush_every_seconds_ = 5;
+};
+} // namespace port
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/win_thread.cc b/src/rocksdb/port/win/win_thread.cc
new file mode 100644
index 000000000..3c82e736e
--- /dev/null
+++ b/src/rocksdb/port/win/win_thread.cc
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+// Most Mingw builds support std::thread only when using posix threads.
+// In that case, some of these functions will be unavailable.
+// Note that we're using either WindowsThread or std::thread, depending on
+// which one is available.
+#ifndef _POSIX_THREADS
+
+#include "port/win/win_thread.h"
+
+#include <assert.h>
+#include <process.h> // __beginthreadex
+#include <windows.h>
+
+#include <stdexcept>
+#include <system_error>
+#include <thread>
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+struct WindowsThread::Data {
+ std::function<void()> func_;
+ uintptr_t handle_;
+
+ Data(std::function<void()>&& func) : func_(std::move(func)), handle_(0) {}
+
+ Data(const Data&) = delete;
+ Data& operator=(const Data&) = delete;
+
+ static unsigned int __stdcall ThreadProc(void* arg);
+};
+
+void WindowsThread::Init(std::function<void()>&& func) {
+ data_ = std::make_shared<Data>(std::move(func));
+ // We create another instance of std::shared_ptr to get an additional ref
+ // since we may detach and destroy this instance before the threadproc
+ // may start to run. We choose to allocate this additional ref on the heap
+ // so we do not need to synchronize and allow this thread to proceed
+ std::unique_ptr<std::shared_ptr<Data>> th_data(
+ new std::shared_ptr<Data>(data_));
+
+ data_->handle_ = _beginthreadex(NULL,
+ 0, // stack size
+ &Data::ThreadProc, th_data.get(),
+ 0, // init flag
+ &th_id_);
+
+ if (data_->handle_ == 0) {
+ throw std::system_error(
+ std::make_error_code(std::errc::resource_unavailable_try_again),
+ "Unable to create a thread");
+ }
+ th_data.release();
+}
+
+WindowsThread::WindowsThread() : data_(nullptr), th_id_(0) {}
+
+WindowsThread::~WindowsThread() {
+ // Must be joined or detached
+ // before destruction.
+ // This is the same as std::thread
+ if (data_) {
+ if (joinable()) {
+ assert(false);
+ std::terminate();
+ }
+ data_.reset();
+ }
+}
+
+WindowsThread::WindowsThread(WindowsThread&& o) noexcept : WindowsThread() {
+ *this = std::move(o);
+}
+
+WindowsThread& WindowsThread::operator=(WindowsThread&& o) noexcept {
+ if (joinable()) {
+ assert(false);
+ std::terminate();
+ }
+
+ data_ = std::move(o.data_);
+
+ // Per spec both instances will have the same id
+ th_id_ = o.th_id_;
+
+ return *this;
+}
+
+bool WindowsThread::joinable() const { return (data_ && data_->handle_ != 0); }
+
+WindowsThread::native_handle_type WindowsThread::native_handle() const {
+ return reinterpret_cast<native_handle_type>(data_->handle_);
+}
+
+unsigned WindowsThread::hardware_concurrency() {
+ return std::thread::hardware_concurrency();
+}
+
+void WindowsThread::join() {
+ if (!joinable()) {
+ assert(false);
+ throw std::system_error(std::make_error_code(std::errc::invalid_argument),
+ "Thread is no longer joinable");
+ }
+
+ if (GetThreadId(GetCurrentThread()) == th_id_) {
+ assert(false);
+ throw std::system_error(
+ std::make_error_code(std::errc::resource_deadlock_would_occur),
+ "Can not join itself");
+ }
+
+ auto ret =
+ WaitForSingleObject(reinterpret_cast<HANDLE>(data_->handle_), INFINITE);
+ if (ret != WAIT_OBJECT_0) {
+ auto lastError = GetLastError();
+ assert(false);
+ throw std::system_error(static_cast<int>(lastError), std::system_category(),
+ "WaitForSingleObjectFailed: thread join");
+ }
+
+ BOOL rc
+#if defined(_MSC_VER)
+ = FALSE;
+#else
+ __attribute__((__unused__));
+#endif
+ rc = CloseHandle(reinterpret_cast<HANDLE>(data_->handle_));
+ assert(rc != 0);
+ data_->handle_ = 0;
+}
+
+bool WindowsThread::detach() {
+ if (!joinable()) {
+ assert(false);
+ throw std::system_error(std::make_error_code(std::errc::invalid_argument),
+ "Thread is no longer available");
+ }
+
+ BOOL ret = CloseHandle(reinterpret_cast<HANDLE>(data_->handle_));
+ data_->handle_ = 0;
+
+ return (ret != 0);
+}
+
+void WindowsThread::swap(WindowsThread& o) {
+ data_.swap(o.data_);
+ std::swap(th_id_, o.th_id_);
+}
+
+unsigned int __stdcall WindowsThread::Data::ThreadProc(void* arg) {
+ auto ptr = reinterpret_cast<std::shared_ptr<Data>*>(arg);
+ std::unique_ptr<std::shared_ptr<Data>> data(ptr);
+ (*data)->func_();
+ return 0;
+}
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !_POSIX_THREADS
+#endif // OS_WIN
diff --git a/src/rocksdb/port/win/win_thread.h b/src/rocksdb/port/win/win_thread.h
new file mode 100644
index 000000000..916033b77
--- /dev/null
+++ b/src/rocksdb/port/win/win_thread.h
@@ -0,0 +1,117 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef _POSIX_THREADS
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// This class is a replacement for std::thread
+// 2 reasons we do not like std::thread:
+// -- is that it dynamically allocates its internals that are automatically
+// freed when the thread terminates and not on the destruction of the
+// object. This makes it difficult to control the source of memory
+// allocation
+// - This implements Pimpl so we can easily replace the guts of the
+// object in our private version if necessary.
+class WindowsThread {
+ struct Data;
+
+ std::shared_ptr<Data> data_;
+ unsigned int th_id_;
+
+ void Init(std::function<void()>&&);
+
+ public:
+ using native_handle_type = void*;
+
+ // Construct with no thread
+ WindowsThread();
+
+ // Template constructor
+ //
+ // This templated constructor accomplishes several things
+ //
+ // - Allows the class as whole to be not a template
+ //
+ // - take "universal" references to support both _lvalues and _rvalues
+ //
+ // - because this constructor is a catchall case in many respects it
+ // may prevent us from using both the default __ctor, the move __ctor.
+ // Also it may circumvent copy __ctor deletion. To work around this
+ // we make sure this one has at least one argument and eliminate
+ // it from the overload selection when WindowsThread is the first
+ // argument.
+ //
+ // - construct with Fx(Ax...) with a variable number of types/arguments.
+ //
+ // - Gathers together the callable object with its arguments and constructs
+ // a single callable entity
+ //
+ // - Makes use of std::function to convert it to a specification-template
+ // dependent type that both checks the signature conformance to ensure
+ // that all of the necessary arguments are provided and allows pimpl
+ // implementation.
+ template <class Fn, class... Args,
+ class = typename std::enable_if<!std::is_same<
+ typename std::decay<Fn>::type, WindowsThread>::value>::type>
+ explicit WindowsThread(Fn&& fx, Args&&... ax) : WindowsThread() {
+ // Use binder to create a single callable entity
+ auto binder = std::bind(std::forward<Fn>(fx), std::forward<Args>(ax)...);
+ // Use std::function to take advantage of the type erasure
+ // so we can still hide implementation within pimpl
+ // This also makes sure that the binder signature is compliant
+ std::function<void()> target = binder;
+
+ Init(std::move(target));
+ }
+
+ ~WindowsThread();
+
+ WindowsThread(const WindowsThread&) = delete;
+
+ WindowsThread& operator=(const WindowsThread&) = delete;
+
+ WindowsThread(WindowsThread&&) noexcept;
+
+ WindowsThread& operator=(WindowsThread&&) noexcept;
+
+ bool joinable() const;
+
+ unsigned int get_id() const { return th_id_; }
+
+ native_handle_type native_handle() const;
+
+ static unsigned hardware_concurrency();
+
+ void join();
+
+ bool detach();
+
+ void swap(WindowsThread&);
+};
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+namespace std {
+inline void swap(ROCKSDB_NAMESPACE::port::WindowsThread& th1,
+ ROCKSDB_NAMESPACE::port::WindowsThread& th2) {
+ th1.swap(th2);
+}
+} // namespace std
+
+#endif // !_POSIX_THREADS
diff --git a/src/rocksdb/port/win/xpress_win.cc b/src/rocksdb/port/win/xpress_win.cc
new file mode 100644
index 000000000..21904d502
--- /dev/null
+++ b/src/rocksdb/port/win/xpress_win.cc
@@ -0,0 +1,210 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/xpress_win.h"
+
+#include <windows.h>
+
+#include <cassert>
+#include <iostream>
+#include <limits>
+#include <memory>
+
+#ifdef XPRESS
+
+// Put this under ifdef so windows systems w/o this
+// can still build
+#include <compressapi.h>
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+namespace xpress {
+
+// Helpers
+namespace {
+
+auto CloseCompressorFun = [](void* h) {
+ if (NULL != h) {
+ ::CloseCompressor(reinterpret_cast<COMPRESSOR_HANDLE>(h));
+ }
+};
+
+auto CloseDecompressorFun = [](void* h) {
+ if (NULL != h) {
+ ::CloseDecompressor(reinterpret_cast<DECOMPRESSOR_HANDLE>(h));
+ }
+};
+} // namespace
+
+bool Compress(const char* input, size_t length, std::string* output) {
+ assert(input != nullptr);
+ assert(output != nullptr);
+
+ if (length == 0) {
+ output->clear();
+ return true;
+ }
+
+ COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
+ COMPRESSOR_HANDLE compressor = NULL;
+
+ BOOL success =
+ CreateCompressor(COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm
+ allocRoutinesPtr, // Optional allocation routine
+ &compressor); // Handle
+
+ if (!success) {
+#ifdef _DEBUG
+ std::cerr << "XPRESS: Failed to create Compressor LastError: "
+ << GetLastError() << std::endl;
+#endif
+ return false;
+ }
+
+ std::unique_ptr<void, decltype(CloseCompressorFun)> compressorGuard(
+ compressor, CloseCompressorFun);
+
+ SIZE_T compressedBufferSize = 0;
+
+ // Query compressed buffer size.
+ success = ::Compress(compressor, // Compressor Handle
+ const_cast<char*>(input), // Input buffer
+ length, // Uncompressed data size
+ NULL, // Compressed Buffer
+ 0, // Compressed Buffer size
+ &compressedBufferSize); // Compressed Data size
+
+ if (!success) {
+ auto lastError = GetLastError();
+
+ if (lastError != ERROR_INSUFFICIENT_BUFFER) {
+#ifdef _DEBUG
+ std::cerr
+ << "XPRESS: Failed to estimate compressed buffer size LastError "
+ << lastError << std::endl;
+#endif
+ return false;
+ }
+ }
+
+ assert(compressedBufferSize > 0);
+
+ std::string result;
+ result.resize(compressedBufferSize);
+
+ SIZE_T compressedDataSize = 0;
+
+ // Compress
+ success = ::Compress(compressor, // Compressor Handle
+ const_cast<char*>(input), // Input buffer
+ length, // Uncompressed data size
+ &result[0], // Compressed Buffer
+ compressedBufferSize, // Compressed Buffer size
+ &compressedDataSize); // Compressed Data size
+
+ if (!success) {
+#ifdef _DEBUG
+ std::cerr << "XPRESS: Failed to compress LastError " << GetLastError()
+ << std::endl;
+#endif
+ return false;
+ }
+
+ result.resize(compressedDataSize);
+ output->swap(result);
+
+ return true;
+}
+
+char* Decompress(const char* input_data, size_t input_length,
+ size_t* uncompressed_size) {
+ assert(input_data != nullptr);
+ assert(uncompressed_size != nullptr);
+
+ if (input_length == 0) {
+ return nullptr;
+ }
+
+ COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
+ DECOMPRESSOR_HANDLE decompressor = NULL;
+
+ BOOL success =
+ CreateDecompressor(COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm
+ allocRoutinesPtr, // Optional allocation routine
+ &decompressor); // Handle
+
+ if (!success) {
+#ifdef _DEBUG
+ std::cerr << "XPRESS: Failed to create Decompressor LastError "
+ << GetLastError() << std::endl;
+#endif
+ return nullptr;
+ }
+
+ std::unique_ptr<void, decltype(CloseDecompressorFun)> compressorGuard(
+ decompressor, CloseDecompressorFun);
+
+ SIZE_T decompressedBufferSize = 0;
+
+ success = ::Decompress(decompressor, // Compressor Handle
+ const_cast<char*>(input_data), // Compressed data
+ input_length, // Compressed data size
+ NULL, // Buffer set to NULL
+ 0, // Buffer size set to 0
+ &decompressedBufferSize); // Decompressed Data size
+
+ if (!success) {
+ auto lastError = GetLastError();
+
+ if (lastError != ERROR_INSUFFICIENT_BUFFER) {
+#ifdef _DEBUG
+ std::cerr
+ << "XPRESS: Failed to estimate decompressed buffer size LastError "
+ << lastError << std::endl;
+#endif
+ return nullptr;
+ }
+ }
+
+ assert(decompressedBufferSize > 0);
+
+ // The callers are deallocating using delete[]
+ // thus we must allocate with new[]
+ std::unique_ptr<char[]> outputBuffer(new char[decompressedBufferSize]);
+
+ SIZE_T decompressedDataSize = 0;
+
+ success = ::Decompress(decompressor, const_cast<char*>(input_data),
+ input_length, outputBuffer.get(),
+ decompressedBufferSize, &decompressedDataSize);
+
+ if (!success) {
+#ifdef _DEBUG
+ std::cerr << "XPRESS: Failed to decompress LastError " << GetLastError()
+ << std::endl;
+#endif
+ return nullptr;
+ }
+
+ *uncompressed_size = decompressedDataSize;
+
+ // Return the raw buffer to the caller supporting the tradition
+ return outputBuffer.release();
+}
+} // namespace xpress
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+#endif
+
+#endif
diff --git a/src/rocksdb/port/win/xpress_win.h b/src/rocksdb/port/win/xpress_win.h
new file mode 100644
index 000000000..187adffa6
--- /dev/null
+++ b/src/rocksdb/port/win/xpress_win.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+namespace xpress {
+
+bool Compress(const char* input, size_t length, std::string* output);
+
+char* Decompress(const char* input_data, size_t input_length,
+ size_t* uncompressed_size);
+} // namespace xpress
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/xpress.h b/src/rocksdb/port/xpress.h
new file mode 100644
index 000000000..457025f66
--- /dev/null
+++ b/src/rocksdb/port/xpress.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+// Xpress on Windows is implemeted using Win API
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#error "Xpress compression not implemented"
+#elif defined(OS_WIN)
+#include "port/win/xpress_win.h"
+#endif