Diffstat (limited to 'src/rocksdb/port/win/io_win.cc')
-rw-r--r--  src/rocksdb/port/win/io_win.cc | 1069
1 file changed, 1069 insertions(+), 0 deletions(-)
diff --git a/src/rocksdb/port/win/io_win.cc b/src/rocksdb/port/win/io_win.cc
new file mode 100644
index 000000000..e6c6890ef
--- /dev/null
+++ b/src/rocksdb/port/win/io_win.cc
@@ -0,0 +1,1069 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/win/io_win.h"
+
+#include "monitoring/iostats_context_imp.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+/*
+* DirectIOHelper
+*/
+namespace {
+
+const size_t kSectorSize = 512;
+
+inline
+bool IsPowerOfTwo(const size_t alignment) {
+ return ((alignment) & (alignment - 1)) == 0;
+}
+
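+// For example, with kSectorSize == 512, IsSectorAligned(1024) is true while
+// IsSectorAligned(1000) is false (1000 & 511 == 488).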
+inline
+bool IsSectorAligned(const size_t off) {
+ return (off & (kSectorSize - 1)) == 0;
+}
+
+inline
+bool IsAligned(size_t alignment, const void* ptr) {
+ return ((uintptr_t(ptr)) & (alignment - 1)) == 0;
+}
+}
+
+
+std::string GetWindowsErrSz(DWORD err) {
+  LPSTR lpMsgBuf = nullptr;
+  DWORD ret = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+                                 FORMAT_MESSAGE_IGNORE_INSERTS,
+                             NULL, err,
+                             0, // Default language
+                             reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
+
+  // FormatMessageA() can fail; avoid reading an unset buffer in that case.
+  if (ret == 0 || lpMsgBuf == nullptr) {
+    return "Unknown Windows error: " + std::to_string(err);
+  }
+
+  std::string Err = lpMsgBuf;
+  LocalFree(lpMsgBuf);
+  return Err;
+}
+
+// We preserve the original name of this interface to denote the original idea
+// behind it.
+// All reads and writes happen at an explicitly specified offset. On POSIX,
+// pwrite() does not change the position of the file pointer: judging from the
+// man page and errno, it effectively performs the lseek back to the original
+// position atomically. WriteFile() has no such capability, so for both pread
+// and pwrite here the file pointer is advanced past the transferred bytes,
+// which is fine for writes because they are (or should be) sequential.
+// Because all reads and writes take an explicit offset, the caller should not
+// rely on the current file offset anyway.
+Status pwrite(const WinFileData* file_data, const Slice& data,
+ uint64_t offset, size_t& bytes_written) {
+
+ Status s;
+ bytes_written = 0;
+
+ size_t num_bytes = data.size();
+ if (num_bytes > std::numeric_limits<DWORD>::max()) {
+    // This can happen in 64-bit builds where size_t is 64 bits while
+    // WriteFile() takes a 32-bit DWORD count, which is the API limit here.
+ return Status::InvalidArgument("num_bytes is too large for a single write: " +
+ file_data->GetName());
+ }
+
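+  // ReadFile()/WriteFile() take the explicit 64-bit offset through the
+  // OVERLAPPED structure, split into its low and high 32-bit halves; on a
+  // handle opened for synchronous I/O the call still blocks until completion.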
+ OVERLAPPED overlapped = { 0 };
+ ULARGE_INTEGER offsetUnion;
+ offsetUnion.QuadPart = offset;
+
+ overlapped.Offset = offsetUnion.LowPart;
+ overlapped.OffsetHigh = offsetUnion.HighPart;
+
+ DWORD bytesWritten = 0;
+
+ if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast<DWORD>(num_bytes),
+ &bytesWritten, &overlapped)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
+ lastError);
+ } else {
+ bytes_written = bytesWritten;
+ }
+
+ return s;
+}
+
+// See comments for pwrite above
+Status pread(const WinFileData* file_data, char* src, size_t num_bytes,
+ uint64_t offset, size_t& bytes_read) {
+
+ Status s;
+ bytes_read = 0;
+
+ if (num_bytes > std::numeric_limits<DWORD>::max()) {
+ return Status::InvalidArgument("num_bytes is too large for a single read: " +
+ file_data->GetName());
+ }
+
+ OVERLAPPED overlapped = { 0 };
+ ULARGE_INTEGER offsetUnion;
+ offsetUnion.QuadPart = offset;
+
+ overlapped.Offset = offsetUnion.LowPart;
+ overlapped.OffsetHigh = offsetUnion.HighPart;
+
+ DWORD bytesRead = 0;
+
+ if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast<DWORD>(num_bytes),
+ &bytesRead, &overlapped)) {
+ auto lastError = GetLastError();
+ // EOF is OK with zero bytes read
+ if (lastError != ERROR_HANDLE_EOF) {
+ s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
+ lastError);
+ }
+ } else {
+ bytes_read = bytesRead;
+ }
+
+ return s;
+}
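+
+// Illustrative caller pattern (hypothetical, not part of this file): both
+// helpers report progress through the out parameter, so callers are expected
+// to check the Status as well as the byte count, e.g.
+//   size_t bytes_read = 0;
+//   Status s = pread(file_data, scratch, len, offset, bytes_read);
+//   if (s.ok() && bytes_read < len) { /* EOF or short read */ }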
+
+// SetFileInformationByHandle() is capable of fast pre-allocation.
+// However, it does not change the end-of-file position unless the file is
+// truncated, and the pre-allocated space is not considered zero-filled.
+Status fallocate(const std::string& filename, HANDLE hFile,
+ uint64_t to_size) {
+ Status status;
+
+ FILE_ALLOCATION_INFO alloc_info;
+ alloc_info.AllocationSize.QuadPart = to_size;
+
+ if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
+ sizeof(FILE_ALLOCATION_INFO))) {
+ auto lastError = GetLastError();
+ status = IOErrorFromWindowsError(
+ "Failed to pre-allocate space: " + filename, lastError);
+ }
+
+ return status;
+}
+
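+// Unlike fallocate() above, ftruncate() moves the logical end-of-file marker
+// (via FileEndOfFileInfo); e.g. WinMmapFile::Close() uses it to trim the file
+// back to the exact amount of data written after pre-allocation.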
+Status ftruncate(const std::string& filename, HANDLE hFile,
+ uint64_t toSize) {
+ Status status;
+
+ FILE_END_OF_FILE_INFO end_of_file;
+ end_of_file.EndOfFile.QuadPart = toSize;
+
+ if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
+ sizeof(FILE_END_OF_FILE_INFO))) {
+ auto lastError = GetLastError();
+ status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
+ lastError);
+ }
+
+ return status;
+}
+
+size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
+ size_t /*max_size*/) {
+ // Returning 0 is safe as it causes the table reader to generate a unique ID.
+ // This is suboptimal for performance as it prevents multiple table readers
+ // for the same file from sharing cached blocks. For example, if users have
+ // a low value for `max_open_files`, there can be many table readers opened
+ // for the same file.
+ //
+  // TODO: this is a temporary solution; it is safe but not optimal for
+  // performance. For more details see the discussion in
+ // https://github.com/facebook/rocksdb/pull/5844.
+ return 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// WinMmapReadableFile
+
+WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
+ HANDLE hFile, HANDLE hMap,
+ const void* mapped_region,
+ size_t length)
+ : WinFileData(fileName, hFile, false /* use_direct_io */),
+ hMap_(hMap),
+ mapped_region_(mapped_region),
+ length_(length) {}
+
+WinMmapReadableFile::~WinMmapReadableFile() {
+ BOOL ret __attribute__((__unused__));
+ ret = ::UnmapViewOfFile(mapped_region_);
+ assert(ret);
+
+ ret = ::CloseHandle(hMap_);
+ assert(ret);
+}
+
+Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const {
+ Status s;
+
+ if (offset > length_) {
+ *result = Slice();
+ return IOError(filename_, EINVAL);
+ } else if (offset + n > length_) {
+ n = length_ - static_cast<size_t>(offset);
+ }
+ *result =
+ Slice(reinterpret_cast<const char*>(mapped_region_)+offset, n);
+ return s;
+}
+
+Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
+ return Status::OK();
+}
+
+size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// WinMmapFile
+
+
+// Truncation and reservation sizes must be sector-aligned when used on files
+// opened with unbuffered (direct) I/O.
+Status WinMmapFile::TruncateFile(uint64_t toSize) {
+ return ftruncate(filename_, hFile_, toSize);
+}
+
+Status WinMmapFile::UnmapCurrentRegion() {
+ Status status;
+
+ if (mapped_begin_ != nullptr) {
+ if (!::UnmapViewOfFile(mapped_begin_)) {
+ status = IOErrorFromWindowsError(
+ "Failed to unmap file view: " + filename_, GetLastError());
+ }
+
+ // Move on to the next portion of the file
+ file_offset_ += view_size_;
+
+    // UnmapViewOfFile() automatically flushes the data (but not the metadata)
+    // to disk, which is good and gives us a rough equivalent of fdatasync()
+    // on Linux, so we do not need a separate flag for the metadata.
+ mapped_begin_ = nullptr;
+ mapped_end_ = nullptr;
+ dst_ = nullptr;
+
+ last_sync_ = nullptr;
+ pending_sync_ = false;
+ }
+
+ return status;
+}
+
+Status WinMmapFile::MapNewRegion() {
+
+ Status status;
+
+ assert(mapped_begin_ == nullptr);
+
+ size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
+
+ if (minDiskSize > reserved_size_) {
+ status = Allocate(file_offset_, view_size_);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ // Need to remap
+ if (hMap_ == NULL || reserved_size_ > mapping_size_) {
+
+ if (hMap_ != NULL) {
+ // Unmap the previous one
+ BOOL ret __attribute__((__unused__));
+ ret = ::CloseHandle(hMap_);
+ assert(ret);
+ hMap_ = NULL;
+ }
+
+ ULARGE_INTEGER mappingSize;
+ mappingSize.QuadPart = reserved_size_;
+
+ hMap_ = CreateFileMappingA(
+ hFile_,
+ NULL, // Security attributes
+ PAGE_READWRITE, // There is not a write only mode for mapping
+ mappingSize.HighPart, // Enable mapping the whole file but the actual
+ // amount mapped is determined by MapViewOfFile
+ mappingSize.LowPart,
+ NULL); // Mapping name
+
+ if (NULL == hMap_) {
+ return IOErrorFromWindowsError(
+ "WindowsMmapFile failed to create file mapping for: " + filename_,
+ GetLastError());
+ }
+
+ mapping_size_ = reserved_size_;
+ }
+
+ ULARGE_INTEGER offset;
+ offset.QuadPart = file_offset_;
+
+ // View must begin at the granularity aligned offset
+ mapped_begin_ = reinterpret_cast<char*>(
+ MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
+ view_size_, NULL));
+
+ if (!mapped_begin_) {
+ status = IOErrorFromWindowsError(
+ "WindowsMmapFile failed to map file view: " + filename_,
+ GetLastError());
+ } else {
+ mapped_end_ = mapped_begin_ + view_size_;
+ dst_ = mapped_begin_;
+ last_sync_ = mapped_begin_;
+ pending_sync_ = false;
+ }
+ return status;
+}
+
+Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
+ return fallocate(filename_, hFile_, spaceToReserve);
+}
+
+WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
+ size_t page_size, size_t allocation_granularity,
+ const EnvOptions& options)
+ : WinFileData(fname, hFile, false),
+ WritableFile(options),
+ hMap_(NULL),
+ page_size_(page_size),
+ allocation_granularity_(allocation_granularity),
+ reserved_size_(0),
+ mapping_size_(0),
+ view_size_(0),
+ mapped_begin_(nullptr),
+ mapped_end_(nullptr),
+ dst_(nullptr),
+ last_sync_(nullptr),
+ file_offset_(0),
+ pending_sync_(false) {
+ // Allocation granularity must be obtained from GetSystemInfo() and must be
+ // a power of two.
+ assert(allocation_granularity > 0);
+ assert((allocation_granularity & (allocation_granularity - 1)) == 0);
+
+ assert(page_size > 0);
+ assert((page_size & (page_size - 1)) == 0);
+
+ // Only for memory mapped writes
+ assert(options.use_mmap_writes);
+
+  // The view size must be a multiple of both the allocation granularity and
+  // the page size; in practice the granularity is itself a multiple of the
+  // page size.
+  const size_t viewSize = 32 * 1024;  // 32 KB, similar to the Windows file cache in buffered mode
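+  // With the typical 64 KB allocation granularity, the Roundup() below turns
+  // the 32 KB request into a single 64 KB view.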
+ view_size_ = Roundup(viewSize, allocation_granularity_);
+}
+
+WinMmapFile::~WinMmapFile() {
+ if (hFile_) {
+ this->Close();
+ }
+}
+
+Status WinMmapFile::Append(const Slice& data) {
+ const char* src = data.data();
+ size_t left = data.size();
+
+ while (left > 0) {
+ assert(mapped_begin_ <= dst_);
+ size_t avail = mapped_end_ - dst_;
+
+ if (avail == 0) {
+ Status s = UnmapCurrentRegion();
+ if (s.ok()) {
+ s = MapNewRegion();
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ size_t n = std::min(left, avail);
+ memcpy(dst_, src, n);
+ dst_ += n;
+ src += n;
+ left -= n;
+ pending_sync_ = true;
+ }
+ }
+
+ // Now make sure that the last partial page is padded with zeros if needed
+ size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
+ if (bytesToPad > 0) {
+ memset(dst_, 0, bytesToPad);
+ }
+
+ return Status::OK();
+}
+
+// Close() properly takes care of the final truncation,
+// so nothing else is needed here.
+Status WinMmapFile::Truncate(uint64_t size) {
+ return Status::OK();
+}
+
+Status WinMmapFile::Close() {
+ Status s;
+
+ assert(NULL != hFile_);
+
+  // We truncate to the precise size so there is no uninitialized data at the
+  // end. SetEndOfFile, which we rely on, does not write zeros, which is
+  // exactly what we want.
+ uint64_t targetSize = GetFileSize();
+
+ if (mapped_begin_ != nullptr) {
+    // Sync before unmapping to make sure everything is on disk and there is
+    // no lazy write-back, so the tests behave deterministically.
+ Sync();
+ s = UnmapCurrentRegion();
+ }
+
+ if (NULL != hMap_) {
+ BOOL ret = ::CloseHandle(hMap_);
+ if (!ret && s.ok()) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Failed to Close mapping for file: " + filename_, lastError);
+ }
+
+ hMap_ = NULL;
+ }
+
+ if (hFile_ != NULL) {
+
+ TruncateFile(targetSize);
+
+ BOOL ret = ::CloseHandle(hFile_);
+ hFile_ = NULL;
+
+ if (!ret && s.ok()) {
+ auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+          "Failed to close file handle: " + filename_, lastError);
+ }
+ }
+
+ return s;
+}
+
+Status WinMmapFile::Flush() { return Status::OK(); }
+
+// Flush only data
+Status WinMmapFile::Sync() {
+ Status s;
+
+ // Some writes occurred since last sync
+ if (dst_ > last_sync_) {
+ assert(mapped_begin_);
+ assert(dst_);
+ assert(dst_ > mapped_begin_);
+ assert(dst_ < mapped_end_);
+
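+    // TruncateToPageBoundary() rounds an offset down to the containing page
+    // boundary, e.g. with a 4 KB page size an offset of 5000 maps to 4096.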
+ size_t page_begin =
+ TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
+ size_t page_end =
+ TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
+
+    // Flush only a page-aligned range that covers everything written since
+    // the last sync
+ if (!::FlushViewOfFile(mapped_begin_ + page_begin,
+ (page_end - page_begin) + page_size_)) {
+ s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
+ GetLastError());
+ } else {
+ last_sync_ = dst_;
+ }
+ }
+
+ return s;
+}
+
+/**
+* Flush data as well as metadata to stable storage.
+*/
+Status WinMmapFile::Fsync() {
+ Status s = Sync();
+
+ // Flush metadata
+ if (s.ok() && pending_sync_) {
+ if (!::FlushFileBuffers(hFile_)) {
+ s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
+ GetLastError());
+ }
+ pending_sync_ = false;
+ }
+
+ return s;
+}
+
+/**
+* Get the size of valid data in the file. This will not match the size
+* reported by the filesystem because we extend the file one mapped view
+* (view_size_) at a time.
+*/
+uint64_t WinMmapFile::GetFileSize() {
+ size_t used = dst_ - mapped_begin_;
+ return file_offset_ + used;
+}
+
+Status WinMmapFile::InvalidateCache(size_t offset, size_t length) {
+ return Status::OK();
+}
+
+Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) {
+ Status status;
+ TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);
+
+  // Make sure that we reserve an aligned amount of space. The reservation
+  // block size is driven from outside, so check here whether the existing
+  // reservation already covers the request.
+ size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len), view_size_);
+ // Nothing to do
+ if (spaceToReserve <= reserved_size_) {
+ return status;
+ }
+
+ IOSTATS_TIMER_GUARD(allocate_nanos);
+ status = PreallocateInternal(spaceToReserve);
+ if (status.ok()) {
+ reserved_size_ = spaceToReserve;
+ }
+ return status;
+}
+
+size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+// WinSequentialFile
+
+WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
+ const EnvOptions& options)
+ : WinFileData(fname, f, options.use_direct_reads) {}
+
+WinSequentialFile::~WinSequentialFile() {
+ assert(hFile_ != INVALID_HANDLE_VALUE);
+}
+
+Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) {
+ Status s;
+ size_t r = 0;
+
+ assert(result != nullptr);
+ if (WinFileData::use_direct_io()) {
+ return Status::NotSupported("Read() does not support direct_io");
+ }
+
+  // The Windows ReadFile API accepts a DWORD byte count.
+  // Reading in a loop could handle a larger n, but such a request is
+  // unlikely, so we simply reject it.
+ if (n > std::numeric_limits<DWORD>::max()) {
+ return Status::InvalidArgument("n is too big for a single ReadFile: "
+ + filename_);
+ }
+
+ DWORD bytesToRead = static_cast<DWORD>(n); //cast is safe due to the check above
+ DWORD bytesRead = 0;
+ BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
+ if (ret != FALSE) {
+ r = bytesRead;
+ } else {
+ auto lastError = GetLastError();
+ if (lastError != ERROR_HANDLE_EOF) {
+ s = IOErrorFromWindowsError("ReadFile failed: " + filename_,
+ lastError);
+ }
+ }
+
+ *result = Slice(scratch, r);
+ return s;
+}
+
+Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
+ uint64_t offset, size_t& bytes_read) const {
+ return pread(this, src, numBytes, offset, bytes_read);
+}
+
+Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result,
+ char* scratch) {
+
+ Status s;
+
+ if (!WinFileData::use_direct_io()) {
+ return Status::NotSupported("This function is only used for direct_io");
+ }
+
+ if (!IsSectorAligned(static_cast<size_t>(offset)) ||
+ !IsSectorAligned(n)) {
+ return Status::InvalidArgument(
+ "WinSequentialFile::PositionedRead: offset is not properly aligned");
+ }
+
+ size_t bytes_read = 0; // out param
+ s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset, bytes_read);
+ *result = Slice(scratch, bytes_read);
+ return s;
+}
+
+
+Status WinSequentialFile::Skip(uint64_t n) {
+  // Can't handle more than the signed maximum as SetFilePointerEx accepts a
+  // signed 64-bit integer. It is highly unlikely for n to be that large.
+  if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
+    return Status::InvalidArgument(
+        "n is too large for a single SetFilePointerEx() call: " + filename_);
+ }
+
+ LARGE_INTEGER li;
+ li.QuadPart = static_cast<LONGLONG>(n); //cast is safe due to the check above
+ BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
+ if (ret == FALSE) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
+ lastError);
+ }
+ return Status::OK();
+}
+
+Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
+ return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+/// WinRandomAccessBase
+
+inline
+Status WinRandomAccessImpl::PositionedReadInternal(char* src,
+ size_t numBytes,
+ uint64_t offset,
+ size_t& bytes_read) const {
+ return pread(file_base_, src, numBytes, offset, bytes_read);
+}
+
+inline
+WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
+ size_t alignment,
+ const EnvOptions& options) :
+ file_base_(file_base),
+ alignment_(alignment) {
+
+ assert(!options.use_mmap_reads);
+}
+
+inline
+Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const {
+
+ Status s;
+
+ // Check buffer alignment
+ if (file_base_->use_direct_io()) {
+ if (!IsSectorAligned(static_cast<size_t>(offset)) ||
+ !IsAligned(alignment_, scratch)) {
+ return Status::InvalidArgument(
+ "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned");
+ }
+ }
+
+ if (n == 0) {
+ *result = Slice(scratch, 0);
+ return s;
+ }
+
+ size_t bytes_read = 0;
+ s = PositionedReadInternal(scratch, n, offset, bytes_read);
+ *result = Slice(scratch, bytes_read);
+ return s;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/// WinRandomAccessFile
+
+WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
+ size_t alignment,
+ const EnvOptions& options)
+ : WinFileData(fname, hFile, options.use_direct_reads),
+ WinRandomAccessImpl(this, alignment, options) {}
+
+WinRandomAccessFile::~WinRandomAccessFile() {
+}
+
+Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const {
+ return ReadImpl(offset, n, result, scratch);
+}
+
+Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
+ return Status::OK();
+}
+
+size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
+}
+
+size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
+ return GetAlignment();
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// WinWritableImpl
+//
+
+inline
+Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
+ return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve);
+}
+
+inline
+WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment)
+ : file_data_(file_data),
+ alignment_(alignment),
+ next_write_offset_(0),
+ reservedsize_(0) {
+
+  // Query the current position in case ReopenWritableFile() is called.
+  // This position only matters for buffered writes; for unbuffered writes we
+  // explicitly specify the offset on every call.
+ LARGE_INTEGER zero_move;
+ zero_move.QuadPart = 0; // Do not move
+ LARGE_INTEGER pos;
+ pos.QuadPart = 0;
+ BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
+ FILE_CURRENT);
+  // Querying is not supposed to fail
+ if (ret != 0) {
+ next_write_offset_ = pos.QuadPart;
+ } else {
+ assert(false);
+ }
+}
+
+inline
+Status WinWritableImpl::AppendImpl(const Slice& data) {
+
+ Status s;
+
+ if (data.size() > std::numeric_limits<DWORD>::max()) {
+ return Status::InvalidArgument("data is too long for a single write" +
+ file_data_->GetName());
+ }
+
+ size_t bytes_written = 0; // out param
+
+ if (file_data_->use_direct_io()) {
+ // With no offset specified we are appending
+ // to the end of the file
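+    // The buffer must already satisfy the direct I/O contract: callers
+    // (typically the WritableFileWriter) are expected to hand us an
+    // alignment-padded buffer whose size is a multiple of the sector size.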
+ assert(IsSectorAligned(next_write_offset_));
+ if (!IsSectorAligned(data.size()) ||
+ !IsAligned(static_cast<size_t>(GetAlignement()), data.data())) {
+ s = Status::InvalidArgument(
+ "WriteData must be page aligned, size must be sector aligned");
+ } else {
+ s = pwrite(file_data_, data, next_write_offset_, bytes_written);
+ }
+ } else {
+
+ DWORD bytesWritten = 0;
+ if (!WriteFile(file_data_->GetFileHandle(), data.data(),
+ static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Failed to WriteFile: " + file_data_->GetName(),
+ lastError);
+ } else {
+ bytes_written = bytesWritten;
+ }
+ }
+
+ if(s.ok()) {
+ if (bytes_written == data.size()) {
+ // This matters for direct_io cases where
+ // we rely on the fact that next_write_offset_
+ // is sector aligned
+ next_write_offset_ += bytes_written;
+ } else {
+ s = Status::IOError("Failed to write all bytes: " +
+ file_data_->GetName());
+ }
+ }
+
+ return s;
+}
+
+inline
+Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) {
+
+ if(file_data_->use_direct_io()) {
+ if (!IsSectorAligned(static_cast<size_t>(offset)) ||
+ !IsSectorAligned(data.size()) ||
+ !IsAligned(static_cast<size_t>(GetAlignement()), data.data())) {
+ return Status::InvalidArgument(
+ "Data and offset must be page aligned, size must be sector aligned");
+ }
+ }
+
+ size_t bytes_written = 0;
+ Status s = pwrite(file_data_, data, offset, bytes_written);
+
+ if(s.ok()) {
+ if (bytes_written == data.size()) {
+      // For a purely sequential write this would simply extend the size by
+      // data.size(); for a positioned write we only advance the offset when
+      // the write extends past the current end.
+ uint64_t write_end = offset + bytes_written;
+ if (write_end >= next_write_offset_) {
+ next_write_offset_ = write_end;
+ }
+ } else {
+ s = Status::IOError("Failed to write all of the requested data: " +
+ file_data_->GetName());
+ }
+ }
+ return s;
+}
+
+inline
+Status WinWritableImpl::TruncateImpl(uint64_t size) {
+
+  // It is tempting to check the size for sector alignment, but a truncation
+  // may come at the very end and there is no requirement for it to be sector
+  // aligned, as long as we do not attempt to write past it afterwards. The
+  // interface docs state that the behavior is undefined in that case.
+ Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(),
+ size);
+
+ if (s.ok()) {
+ next_write_offset_ = size;
+ }
+ return s;
+}
+
+inline
+Status WinWritableImpl::CloseImpl() {
+
+ Status s;
+
+ auto hFile = file_data_->GetFileHandle();
+ assert(INVALID_HANDLE_VALUE != hFile);
+
+ if (!::FlushFileBuffers(hFile)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " +
+ file_data_->GetName(),
+ lastError);
+ }
+
+ if(!file_data_->CloseFile() && s.ok()) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(),
+ lastError);
+ }
+ return s;
+}
+
+inline
+Status WinWritableImpl::SyncImpl() {
+ Status s;
+ if (!::FlushFileBuffers (file_data_->GetFileHandle())) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError);
+ }
+ return s;
+}
+
+
+inline
+Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
+ Status status;
+ TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds);
+
+  // Make sure that we reserve an aligned amount of space. The reservation
+  // block size is driven from outside, so check here whether the existing
+  // reservation already covers the request.
+ size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len), static_cast<size_t>(alignment_));
+ // Nothing to do
+ if (spaceToReserve <= reservedsize_) {
+ return status;
+ }
+
+ IOSTATS_TIMER_GUARD(allocate_nanos);
+ status = PreallocateInternal(spaceToReserve);
+ if (status.ok()) {
+ reservedsize_ = spaceToReserve;
+ }
+ return status;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+/// WinWritableFile
+
+WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
+ size_t alignment, size_t /* capacity */,
+ const EnvOptions& options)
+ : WinFileData(fname, hFile, options.use_direct_writes),
+ WinWritableImpl(this, alignment),
+ WritableFile(options) {
+ assert(!options.use_mmap_writes);
+}
+
+WinWritableFile::~WinWritableFile() {
+}
+
+// Indicates if the class makes use of direct I/O
+bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); }
+
+size_t WinWritableFile::GetRequiredBufferAlignment() const {
+ return static_cast<size_t>(GetAlignement());
+}
+
+Status WinWritableFile::Append(const Slice& data) {
+ return AppendImpl(data);
+}
+
+Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
+ return PositionedAppendImpl(data, offset);
+}
+
+// Need to implement this so the file is truncated correctly
+// in both buffered and unbuffered mode
+Status WinWritableFile::Truncate(uint64_t size) {
+ return TruncateImpl(size);
+}
+
+Status WinWritableFile::Close() {
+ return CloseImpl();
+}
+
+// Write out the cached data to the OS cache.
+// This is now taken care of by the WritableFileWriter.
+Status WinWritableFile::Flush() {
+ return Status::OK();
+}
+
+Status WinWritableFile::Sync() {
+ return SyncImpl();
+}
+
+Status WinWritableFile::Fsync() { return SyncImpl(); }
+
+bool WinWritableFile::IsSyncThreadSafe() const { return true; }
+
+uint64_t WinWritableFile::GetFileSize() {
+ return GetFileNextWriteOffset();
+}
+
+Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) {
+ return AllocateImpl(offset, len);
+}
+
+size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
+}
+
+/////////////////////////////////////////////////////////////////////////
+/// WinRandomRWFile
+
+WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
+ size_t alignment, const EnvOptions& options)
+ : WinFileData(fname, hFile,
+ options.use_direct_reads && options.use_direct_writes),
+ WinRandomAccessImpl(this, alignment, options),
+ WinWritableImpl(this, alignment) {}
+
+bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); }
+
+size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
+ return static_cast<size_t>(GetAlignement());
+}
+
+Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) {
+ return PositionedAppendImpl(data, offset);
+}
+
+Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const {
+ return ReadImpl(offset, n, result, scratch);
+}
+
+Status WinRandomRWFile::Flush() {
+ return Status::OK();
+}
+
+Status WinRandomRWFile::Sync() {
+ return SyncImpl();
+}
+
+Status WinRandomRWFile::Close() {
+ return CloseImpl();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// WinMemoryMappedBuffer
+WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
+ BOOL ret
+#if defined(_MSC_VER)
+ = FALSE;
+#else
+ __attribute__((__unused__));
+#endif
+ if (base_ != nullptr) {
+ ret = ::UnmapViewOfFile(base_);
+ assert(ret);
+ base_ = nullptr;
+ }
+ if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
+ ret = ::CloseHandle(map_handle_);
+ assert(ret);
+ map_handle_ = NULL;
+ }
+ if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
+ ret = ::CloseHandle(file_handle_);
+ assert(ret);
+ file_handle_ = NULL;
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// WinDirectory
+
+Status WinDirectory::Fsync() { return Status::OK(); }
+
+size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(handle_, id, max_size);
+}
+//////////////////////////////////////////////////////////////////////////
+/// WinFileLock
+
+WinFileLock::~WinFileLock() {
+ BOOL ret __attribute__((__unused__));
+ ret = ::CloseHandle(hFile_);
+ assert(ret);
+}
+
+}  // namespace port
+} // namespace ROCKSDB_NAMESPACE