Diffstat (limited to '')
-rw-r--r--  src/rocksdb/port/win/io_win.cc  1101
1 file changed, 1101 insertions, 0 deletions
diff --git a/src/rocksdb/port/win/io_win.cc b/src/rocksdb/port/win/io_win.cc
new file mode 100644
index 000000000..4fa735518
--- /dev/null
+++ b/src/rocksdb/port/win/io_win.cc
@@ -0,0 +1,1101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/io_win.h"
+
+#include "env_win.h"
+#include "monitoring/iostats_context_imp.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+/*
+ * DirectIOHelper
+ */
+namespace {
+
+const size_t kSectorSize = 512;
+
+inline bool IsPowerOfTwo(const size_t alignment) {
+ return ((alignment) & (alignment - 1)) == 0;
+}
+
+inline bool IsAligned(size_t alignment, const void* ptr) {
+ return ((uintptr_t(ptr)) & (alignment - 1)) == 0;
+}
+} // namespace
+
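+// Formats a Windows error code into a human-readable message with
+// FormatMessageA(); returns an empty string if no message is available.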
+std::string GetWindowsErrSz(DWORD err) {
+ std::string Err;
+ LPSTR lpMsgBuf = nullptr;
+ FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, err,
+ 0, // Default language
+ reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
+
+ if (lpMsgBuf) {
+ Err = lpMsgBuf;
+ LocalFree(lpMsgBuf);
+ }
+ return Err;
+}
+
+// We preserve the original POSIX names of these helpers to denote the idea
+// behind them: all reads and writes happen at a specified offset, and the
+// POSIX pwrite() interface does not change the position of the file pointer
+// (judging from the man page and errno, it effectively performs an atomic
+// lseek to restore the pointer to where it was). WriteFile() does not have
+// this capability, so for both pread and pwrite below the file pointer is
+// advanced past the transferred bytes. That is fine for writes because they
+// are (or should be) sequential. Because all reads and writes happen at an
+// explicit offset, callers should not rely on the current file offset anyway.
+IOStatus pwrite(const WinFileData* file_data, const Slice& data,
+ uint64_t offset, size_t& bytes_written) {
+ IOStatus s;
+ bytes_written = 0;
+
+ size_t num_bytes = data.size();
+ if (num_bytes > std::numeric_limits<DWORD>::max()) {
+    // May happen in 64-bit builds where size_t is 64-bit but DWORD is still
+    // 32-bit, and that is what the WriteFile() API takes at the moment.
+ return IOStatus::InvalidArgument(
+ "num_bytes is too large for a single write: " + file_data->GetName());
+ }
+
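+  // Passing an OVERLAPPED structure with the target offset makes WriteFile()
+  // write at that offset; on a handle opened without FILE_FLAG_OVERLAPPED the
+  // call still completes synchronously.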
+ OVERLAPPED overlapped = {0};
+ ULARGE_INTEGER offsetUnion;
+ offsetUnion.QuadPart = offset;
+
+ overlapped.Offset = offsetUnion.LowPart;
+ overlapped.OffsetHigh = offsetUnion.HighPart;
+
+ DWORD bytesWritten = 0;
+
+ if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(),
+ static_cast<DWORD>(num_bytes), &bytesWritten,
+ &overlapped)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
+ lastError);
+ } else {
+ bytes_written = bytesWritten;
+ }
+
+ return s;
+}
+
+// See comments for pwrite above
+IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
+ uint64_t offset, size_t& bytes_read) {
+ IOStatus s;
+ bytes_read = 0;
+
+ if (num_bytes > std::numeric_limits<DWORD>::max()) {
+ return IOStatus::InvalidArgument(
+ "num_bytes is too large for a single read: " + file_data->GetName());
+ }
+
+ OVERLAPPED overlapped = {0};
+ ULARGE_INTEGER offsetUnion;
+ offsetUnion.QuadPart = offset;
+
+ overlapped.Offset = offsetUnion.LowPart;
+ overlapped.OffsetHigh = offsetUnion.HighPart;
+
+ DWORD bytesRead = 0;
+
+ if (FALSE == ReadFile(file_data->GetFileHandle(), src,
+ static_cast<DWORD>(num_bytes), &bytesRead,
+ &overlapped)) {
+ auto lastError = GetLastError();
+ // EOF is OK with zero bytes read
+ if (lastError != ERROR_HANDLE_EOF) {
+ s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
+ lastError);
+ }
+ } else {
+ bytes_read = bytesRead;
+ }
+
+ return s;
+}
+
+// SetFileInformationByHandle() is capable of fast pre-allocation.
+// However, it does not change the end-of-file position unless the file is
+// truncated, and the pre-allocated space is not considered filled with zeros.
+IOStatus fallocate(const std::string& filename, HANDLE hFile,
+ uint64_t to_size) {
+ IOStatus status;
+
+ FILE_ALLOCATION_INFO alloc_info;
+ alloc_info.AllocationSize.QuadPart = to_size;
+
+ if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
+ sizeof(FILE_ALLOCATION_INFO))) {
+ auto lastError = GetLastError();
+ status = IOErrorFromWindowsError(
+ "Failed to pre-allocate space: " + filename, lastError);
+ }
+
+ return status;
+}
+
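+// Emulates POSIX ftruncate(): sets the end-of-file marker to toSize,
+// truncating or extending the file as needed.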
+IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) {
+ IOStatus status;
+
+ FILE_END_OF_FILE_INFO end_of_file;
+ end_of_file.EndOfFile.QuadPart = toSize;
+
+ if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
+ sizeof(FILE_END_OF_FILE_INFO))) {
+ auto lastError = GetLastError();
+ status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
+ lastError);
+ }
+
+ return status;
+}
+
+size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
+ size_t /*max_size*/) {
+ // Returning 0 is safe as it causes the table reader to generate a unique ID.
+ // This is suboptimal for performance as it prevents multiple table readers
+ // for the same file from sharing cached blocks. For example, if users have
+ // a low value for `max_open_files`, there can be many table readers opened
+ // for the same file.
+ //
+  // TODO: this is a temporary solution as it is safe but not optimal for
+ // performance. For more details see discussion in
+ // https://github.com/facebook/rocksdb/pull/5844.
+ return 0;
+}
+
+WinFileData::WinFileData(const std::string& filename, HANDLE hFile,
+ bool direct_io)
+ : filename_(filename),
+ hFile_(hFile),
+ use_direct_io_(direct_io),
+ sector_size_(WinFileSystem::GetSectorSize(filename)) {}
+
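+// Assumes sector_size_ is a power of two (e.g. 512 or 4096), so alignment
+// can be tested with a simple bit mask.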
+bool WinFileData::IsSectorAligned(const size_t off) const {
+ return (off & (sector_size_ - 1)) == 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// WinMmapReadableFile
+
+WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
+ HANDLE hFile, HANDLE hMap,
+ const void* mapped_region,
+ size_t length)
+ : WinFileData(fileName, hFile, false /* use_direct_io */),
+ hMap_(hMap),
+ mapped_region_(mapped_region),
+ length_(length) {}
+
+WinMmapReadableFile::~WinMmapReadableFile() {
+ BOOL ret __attribute__((__unused__));
+ ret = ::UnmapViewOfFile(mapped_region_);
+ assert(ret);
+
+ ret = ::CloseHandle(hMap_);
+ assert(ret);
+}
+
+IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n,
+ const IOOptions& /*options*/, Slice* result,
+ char* scratch,
+ IODebugContext* /*dbg*/) const {
+ IOStatus s;
+
+ if (offset > length_) {
+ *result = Slice();
+ return IOError(filename_, EINVAL);
+ } else if (offset + n > length_) {
+ n = length_ - static_cast<size_t>(offset);
+ }
+ *result = Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
+ return s;
+}
+
+IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
+ return IOStatus::OK();
+}
+
+size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// WinMmapFile
+
+// Can only truncate or reserve to a sector-size-aligned size when
+// used on files that are opened with unbuffered I/O
+IOStatus WinMmapFile::TruncateFile(uint64_t toSize) {
+ return ftruncate(filename_, hFile_, toSize);
+}
+
+IOStatus WinMmapFile::UnmapCurrentRegion() {
+ IOStatus status;
+
+ if (mapped_begin_ != nullptr) {
+ if (!::UnmapViewOfFile(mapped_begin_)) {
+ status = IOErrorFromWindowsError(
+ "Failed to unmap file view: " + filename_, GetLastError());
+ }
+
+ // Move on to the next portion of the file
+ file_offset_ += view_size_;
+
+    // UnmapViewOfFile automatically sends the data to disk but not the
+    // metadata, which is good: it provides a rough equivalent of fdatasync()
+    // on Linux, so we do not need a separate flag for the metadata.
+ mapped_begin_ = nullptr;
+ mapped_end_ = nullptr;
+ dst_ = nullptr;
+
+ last_sync_ = nullptr;
+ pending_sync_ = false;
+ }
+
+ return status;
+}
+
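+// Maps the next view_size_ bytes starting at file_offset_, first growing the
+// reservation and the file mapping object if they are too small.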
+IOStatus WinMmapFile::MapNewRegion(const IOOptions& options,
+ IODebugContext* dbg) {
+ IOStatus status;
+
+ assert(mapped_begin_ == nullptr);
+
+ size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
+
+ if (minDiskSize > reserved_size_) {
+ status = Allocate(file_offset_, view_size_, options, dbg);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ // Need to remap
+ if (hMap_ == NULL || reserved_size_ > mapping_size_) {
+ if (hMap_ != NULL) {
+ // Unmap the previous one
+ BOOL ret __attribute__((__unused__));
+ ret = ::CloseHandle(hMap_);
+ assert(ret);
+ hMap_ = NULL;
+ }
+
+ ULARGE_INTEGER mappingSize;
+ mappingSize.QuadPart = reserved_size_;
+
+ hMap_ = CreateFileMappingA(
+ hFile_,
+ NULL, // Security attributes
+ PAGE_READWRITE, // There is not a write only mode for mapping
+ mappingSize.HighPart, // Enable mapping the whole file but the actual
+ // amount mapped is determined by MapViewOfFile
+ mappingSize.LowPart,
+ NULL); // Mapping name
+
+ if (NULL == hMap_) {
+ return IOErrorFromWindowsError(
+ "WindowsMmapFile failed to create file mapping for: " + filename_,
+ GetLastError());
+ }
+
+ mapping_size_ = reserved_size_;
+ }
+
+ ULARGE_INTEGER offset;
+ offset.QuadPart = file_offset_;
+
+  // The view must begin at a granularity-aligned offset
+ mapped_begin_ = reinterpret_cast<char*>(
+ MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
+ view_size_, NULL));
+
+ if (!mapped_begin_) {
+ status = IOErrorFromWindowsError(
+ "WindowsMmapFile failed to map file view: " + filename_,
+ GetLastError());
+ } else {
+ mapped_end_ = mapped_begin_ + view_size_;
+ dst_ = mapped_begin_;
+ last_sync_ = mapped_begin_;
+ pending_sync_ = false;
+ }
+ return status;
+}
+
+IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
+ return fallocate(filename_, hFile_, spaceToReserve);
+}
+
+WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
+ size_t page_size, size_t allocation_granularity,
+ const FileOptions& options)
+ : WinFileData(fname, hFile, false),
+ FSWritableFile(options),
+ hMap_(NULL),
+ page_size_(page_size),
+ allocation_granularity_(allocation_granularity),
+ reserved_size_(0),
+ mapping_size_(0),
+ view_size_(0),
+ mapped_begin_(nullptr),
+ mapped_end_(nullptr),
+ dst_(nullptr),
+ last_sync_(nullptr),
+ file_offset_(0),
+ pending_sync_(false) {
+ // Allocation granularity must be obtained from GetSystemInfo() and must be
+ // a power of two.
+ assert(allocation_granularity > 0);
+ assert((allocation_granularity & (allocation_granularity - 1)) == 0);
+
+ assert(page_size > 0);
+ assert((page_size & (page_size - 1)) == 0);
+
+ // Only for memory mapped writes
+ assert(options.use_mmap_writes);
+
+  // The view size must be a multiple of both the allocation granularity AND
+  // the page size; the granularity is usually a multiple of the page size.
+  const size_t viewSize =
+      32 * 1024;  // 32 KB, similar to the Windows file cache in buffered mode
+ view_size_ = Roundup(viewSize, allocation_granularity_);
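+  // For example, with the typical 64 KB Windows allocation granularity,
+  // Roundup(32 * 1024, 64 * 1024) yields a 64 KB view.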
+}
+
+WinMmapFile::~WinMmapFile() {
+ if (hFile_) {
+ this->Close(IOOptions(), nullptr);
+ }
+}
+
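+// Copies the data into the current mapped view, unmapping the view and
+// mapping the next one whenever the current view is full; the tail of the
+// last touched page is then zero-padded.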
+IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) {
+ const char* src = data.data();
+ size_t left = data.size();
+
+ while (left > 0) {
+ assert(mapped_begin_ <= dst_);
+ size_t avail = mapped_end_ - dst_;
+
+ if (avail == 0) {
+ IOStatus s = UnmapCurrentRegion();
+ if (s.ok()) {
+ s = MapNewRegion(options, dbg);
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ size_t n = std::min(left, avail);
+ memcpy(dst_, src, n);
+ dst_ += n;
+ src += n;
+ left -= n;
+ pending_sync_ = true;
+ }
+ }
+
+ // Now make sure that the last partial page is padded with zeros if needed
+ size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
+ if (bytesToPad > 0) {
+ memset(dst_, 0, bytesToPad);
+ }
+
+ return IOStatus::OK();
+}
+
+// Close() properly takes care of the truncation, so nothing additional is
+// needed here.
+IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) {
+ IOStatus s;
+
+ assert(NULL != hFile_);
+
+  // We truncate to the precise size so there is no uninitialized data at the
+  // end. SetEndOfFile(), which we use, does not write zeros, which is good.
+ uint64_t targetSize = GetFileSize(options, dbg);
+
+ if (mapped_begin_ != nullptr) {
+    // Sync before unmapping to make sure everything is on disk and no lazy
+    // writes are pending, so that the tests behave deterministically.
+ Sync(options, dbg);
+ s = UnmapCurrentRegion();
+ }
+
+ if (NULL != hMap_) {
+ BOOL ret = ::CloseHandle(hMap_);
+ if (!ret && s.ok()) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Failed to Close mapping for file: " + filename_, lastError);
+ }
+
+ hMap_ = NULL;
+ }
+
+ if (hFile_ != NULL) {
+ TruncateFile(targetSize);
+
+ BOOL ret = ::CloseHandle(hFile_);
+ hFile_ = NULL;
+
+ if (!ret && s.ok()) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Failed to close file map handle: " + filename_, lastError);
+ }
+ }
+
+ return s;
+}
+
+IOStatus WinMmapFile::Flush(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+// Flush only data
+IOStatus WinMmapFile::Sync(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+
+ // Some writes occurred since last sync
+ if (dst_ > last_sync_) {
+ assert(mapped_begin_);
+ assert(dst_);
+ assert(dst_ > mapped_begin_);
+ assert(dst_ < mapped_end_);
+
+ size_t page_begin =
+ TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
+ size_t page_end =
+ TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
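+    // For example, with 4 KB pages, last_sync_ at view offset 5000 and dst_
+    // at offset 9000, page_begin is 4096, page_end is 8192, and the flush
+    // below covers the two dirty pages in [4096, 12288).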
+
+    // Flush only an amount that is a multiple of whole pages
+ if (!::FlushViewOfFile(mapped_begin_ + page_begin,
+ (page_end - page_begin) + page_size_)) {
+ s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
+ GetLastError());
+ } else {
+ last_sync_ = dst_;
+ }
+ }
+
+ return s;
+}
+
+/**
+ * Flush data as well as metadata to stable storage.
+ */
+IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
+ IOStatus s = Sync(options, dbg);
+
+ // Flush metadata
+ if (s.ok() && pending_sync_) {
+ if (!::FlushFileBuffers(hFile_)) {
+ s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
+ GetLastError());
+ }
+ pending_sync_ = false;
+ }
+
+ return s;
+}
+
+/**
 * Get the size of valid data in the file. This will not match the
 * size reported by the filesystem because we use mmap
 * to extend the file by view_size_ at a time.
+ */
+uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ size_t used = dst_ - mapped_begin_;
+ return file_offset_ + used;
+}
+
+IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) {
+ return IOStatus::OK();
+}
+
+IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus status;
+ TEST_KILL_RANDOM("WinMmapFile::Allocate");
+
+  // Make sure that we reserve an aligned amount of space. The reservation
+  // block size is driven from outside, so check here whether the current
+  // reservation already covers the request.
+ size_t spaceToReserve =
+ Roundup(static_cast<size_t>(offset + len), view_size_);
+ // Nothing to do
+ if (spaceToReserve <= reserved_size_) {
+ return status;
+ }
+
+ IOSTATS_TIMER_GUARD(allocate_nanos);
+ status = PreallocateInternal(spaceToReserve);
+ if (status.ok()) {
+ reserved_size_ = spaceToReserve;
+ }
+ return status;
+}
+
+size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+// WinSequentialFile
+
+WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
+ const FileOptions& options)
+ : WinFileData(fname, f, options.use_direct_reads) {}
+
+WinSequentialFile::~WinSequentialFile() {
+ assert(hFile_ != INVALID_HANDLE_VALUE);
+}
+
+IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
+ Slice* result, char* scratch,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+ size_t r = 0;
+
+ assert(result != nullptr);
+ if (WinFileData::use_direct_io()) {
+ return IOStatus::NotSupported("Read() does not support direct_io");
+ }
+
+  // The Windows ReadFile API accepts a DWORD byte count.
+  // It would be possible to read in a loop if n is too big, but that
+  // is an unlikely case, so we simply reject it.
+ if (n > std::numeric_limits<DWORD>::max()) {
+ return IOStatus::InvalidArgument("n is too big for a single ReadFile: " +
+ filename_);
+ }
+
+ DWORD bytesToRead =
+ static_cast<DWORD>(n); // cast is safe due to the check above
+ DWORD bytesRead = 0;
+ BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
+ if (ret != FALSE) {
+ r = bytesRead;
+ } else {
+ auto lastError = GetLastError();
+ if (lastError != ERROR_HANDLE_EOF) {
+ s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError);
+ }
+ }
+
+ *result = Slice(scratch, r);
+ return s;
+}
+
+IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
+ uint64_t offset,
+ size_t& bytes_read) const {
+ return pread(this, src, numBytes, offset, bytes_read);
+}
+
+IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n,
+ const IOOptions& /*opts*/,
+ Slice* result, char* scratch,
+ IODebugContext* /*dbg*/) {
+ if (!WinFileData::use_direct_io()) {
+ return IOStatus::NotSupported("This function is only used for direct_io");
+ }
+
+ assert(IsSectorAligned(static_cast<size_t>(offset)));
+ assert(IsSectorAligned(static_cast<size_t>(n)));
+
+ size_t bytes_read = 0; // out param
+ IOStatus s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset,
+ bytes_read);
+ *result = Slice(scratch, bytes_read);
+ return s;
+}
+
+IOStatus WinSequentialFile::Skip(uint64_t n) {
+  // Can't handle more than the signed maximum, as SetFilePointerEx() accepts
+  // a signed 64-bit integer. It is highly unlikely for n to be that large.
+ if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
+ return IOStatus::InvalidArgument(
+ "n is too large for a single SetFilePointerEx() call" + filename_);
+ }
+
+ LARGE_INTEGER li;
+ li.QuadPart = static_cast<LONGLONG>(n); // cast is safe due to the check
+ // above
+ BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
+ if (ret == FALSE) {
+ auto lastError = GetLastError();
+ return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
+ lastError);
+ }
+ return IOStatus::OK();
+}
+
+IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
+ return IOStatus::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+/// WinRandomAccessBase
+
+inline IOStatus WinRandomAccessImpl::PositionedReadInternal(
+ char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const {
+ return pread(file_base_, src, numBytes, offset, bytes_read);
+}
+
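+// The effective read alignment is the larger of the caller-provided alignment
+// and the volume's sector size, which direct I/O requires.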
+inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
+ size_t alignment,
+ const FileOptions& options)
+ : file_base_(file_base),
+ alignment_(std::max(alignment, file_base->GetSectorSize())) {
+ assert(!options.use_mmap_reads);
+}
+
+inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n,
+ Slice* result,
+ char* scratch) const {
+ // Check buffer alignment
+ if (file_base_->use_direct_io()) {
+ assert(file_base_->IsSectorAligned(static_cast<size_t>(offset)));
+ assert(IsAligned(alignment_, scratch));
+ }
+
+ if (n == 0) {
+ *result = Slice(scratch, 0);
+ return IOStatus::OK();
+ }
+
+ size_t bytes_read = 0;
+ IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read);
+ *result = Slice(scratch, bytes_read);
+ return s;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/// WinRandomAccessFile
+
+WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
+ size_t alignment,
+ const FileOptions& options)
+ : WinFileData(fname, hFile, options.use_direct_reads),
+ WinRandomAccessImpl(this, alignment, options) {}
+
+WinRandomAccessFile::~WinRandomAccessFile() {}
+
+IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n,
+ const IOOptions& /*options*/, Slice* result,
+ char* scratch,
+ IODebugContext* /*dbg*/) const {
+ return ReadImpl(offset, n, result, scratch);
+}
+
+IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
+ return IOStatus::OK();
+}
+
+size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
+}
+
+size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
+ return GetAlignment();
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// WinWritableImpl
+//
+
+inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
+ return fallocate(file_data_->GetName(), file_data_->GetFileHandle(),
+ spaceToReserve);
+}
+
+inline WinWritableImpl::WinWritableImpl(WinFileData* file_data,
+ size_t alignment)
+ : file_data_(file_data),
+ alignment_(std::max(alignment, file_data->GetSectorSize())),
+ next_write_offset_(0),
+ reservedsize_(0) {
+  // Query the current position in case ReopenWritableFile() was called.
+  // This position is only important for buffered writes;
+  // for unbuffered writes we explicitly specify the offset.
+ LARGE_INTEGER zero_move;
+ zero_move.QuadPart = 0; // Do not move
+ LARGE_INTEGER pos;
+ pos.QuadPart = 0;
+ BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
+ FILE_CURRENT);
+  // Querying is not supposed to fail
+ if (ret != 0) {
+ next_write_offset_ = pos.QuadPart;
+ } else {
+ assert(false);
+ }
+}
+
+inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) {
+ IOStatus s;
+
+ if (data.size() > std::numeric_limits<DWORD>::max()) {
+ return IOStatus::InvalidArgument("data is too long for a single write" +
+ file_data_->GetName());
+ }
+
+ size_t bytes_written = 0; // out param
+
+ if (file_data_->use_direct_io()) {
+ // With no offset specified we are appending
+ // to the end of the file
+ assert(file_data_->IsSectorAligned(next_write_offset_));
+ assert(file_data_->IsSectorAligned(data.size()));
+ assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
+ s = pwrite(file_data_, data, next_write_offset_, bytes_written);
+ } else {
+ DWORD bytesWritten = 0;
+ if (!WriteFile(file_data_->GetFileHandle(), data.data(),
+ static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "Failed to WriteFile: " + file_data_->GetName(), lastError);
+ } else {
+ bytes_written = bytesWritten;
+ }
+ }
+
+ if (s.ok()) {
+ if (bytes_written == data.size()) {
+ // This matters for direct_io cases where
+ // we rely on the fact that next_write_offset_
+ // is sector aligned
+ next_write_offset_ += bytes_written;
+ } else {
+ s = IOStatus::IOError("Failed to write all bytes: " +
+ file_data_->GetName());
+ }
+ }
+
+ return s;
+}
+
+inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data,
+ uint64_t offset) {
+ if (file_data_->use_direct_io()) {
+ assert(file_data_->IsSectorAligned(static_cast<size_t>(offset)));
+ assert(file_data_->IsSectorAligned(data.size()));
+ assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
+ }
+
+ size_t bytes_written = 0;
+ IOStatus s = pwrite(file_data_, data, offset, bytes_written);
+
+ if (s.ok()) {
+ if (bytes_written == data.size()) {
+      // For a sequential write this is simply a size extension
+      // by data.size() bytes.
+ uint64_t write_end = offset + bytes_written;
+ if (write_end >= next_write_offset_) {
+ next_write_offset_ = write_end;
+ }
+ } else {
+ s = IOStatus::IOError("Failed to write all of the requested data: " +
+ file_data_->GetName());
+ }
+ }
+ return s;
+}
+
+inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) {
+  // It is tempting to check the size for sector alignment, but truncation may
+  // come at the very end and there is no requirement for it to be sector
+  // aligned as long as we do not attempt to write past it afterwards. The
+  // interface docs state that the behavior is undefined in that case.
+ IOStatus s =
+ ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size);
+
+ if (s.ok()) {
+ next_write_offset_ = size;
+ }
+ return s;
+}
+
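+// Flushes OS buffers before closing the handle so that buffered writes reach
+// stable storage even if the caller never called Sync().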
+inline IOStatus WinWritableImpl::CloseImpl() {
+ IOStatus s;
+
+ auto hFile = file_data_->GetFileHandle();
+ assert(INVALID_HANDLE_VALUE != hFile);
+
+ if (!::FlushFileBuffers(hFile)) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "FlushFileBuffers failed at Close() for: " + file_data_->GetName(),
+ lastError);
+ }
+
+ if (!file_data_->CloseFile() && s.ok()) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "CloseHandle failed for: " + file_data_->GetName(), lastError);
+ }
+ return s;
+}
+
+inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus s;
+ if (!::FlushFileBuffers(file_data_->GetFileHandle())) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError(
+ "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(),
+ lastError);
+ }
+ return s;
+}
+
+inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
+ IOStatus status;
+ TEST_KILL_RANDOM("WinWritableFile::Allocate");
+
+  // Make sure that we reserve an aligned amount of space. The reservation
+  // block size is driven from outside, so check here whether the current
+  // reservation already covers the request.
+ size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len),
+ static_cast<size_t>(alignment_));
+ // Nothing to do
+ if (spaceToReserve <= reservedsize_) {
+ return status;
+ }
+
+ IOSTATS_TIMER_GUARD(allocate_nanos);
+ status = PreallocateInternal(spaceToReserve);
+ if (status.ok()) {
+ reservedsize_ = spaceToReserve;
+ }
+ return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// WinWritableFile
+
+WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
+ size_t alignment, size_t /* capacity */,
+ const FileOptions& options)
+ : WinFileData(fname, hFile, options.use_direct_writes),
+ WinWritableImpl(this, alignment),
+ FSWritableFile(options) {
+ assert(!options.use_mmap_writes);
+}
+
+WinWritableFile::~WinWritableFile() {}
+
+// Indicates if the class makes use of direct I/O
+bool WinWritableFile::use_direct_io() const {
+ return WinFileData::use_direct_io();
+}
+
+size_t WinWritableFile::GetRequiredBufferAlignment() const {
+ return static_cast<size_t>(GetAlignment());
+}
+
+IOStatus WinWritableFile::Append(const Slice& data,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return AppendImpl(data);
+}
+
+IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return PositionedAppendImpl(data, offset);
+}
+
+// Need to implement this so the file is truncated correctly
+// in both buffered and unbuffered mode
+IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return TruncateImpl(size);
+}
+
+IOStatus WinWritableFile::Close(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return CloseImpl();
+}
+
+// Write out the cached data to the OS cache.
+// This is now taken care of by the WritableFileWriter.
+IOStatus WinWritableFile::Flush(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) {
+ return SyncImpl(options, dbg);
+}
+
+IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
+ return SyncImpl(options, dbg);
+}
+
+bool WinWritableFile::IsSyncThreadSafe() const { return true; }
+
+uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return GetFileNextWriteOffset();
+}
+
+IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return AllocateImpl(offset, len);
+}
+
+size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
+}
+
+/////////////////////////////////////////////////////////////////////////
+/// WinRandomRWFile
+
+WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
+ size_t alignment, const FileOptions& options)
+ : WinFileData(fname, hFile,
+ options.use_direct_reads && options.use_direct_writes),
+ WinRandomAccessImpl(this, alignment, options),
+ WinWritableImpl(this, alignment) {}
+
+bool WinRandomRWFile::use_direct_io() const {
+ return WinFileData::use_direct_io();
+}
+
+size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
+ assert(WinRandomAccessImpl::GetAlignment() ==
+ WinWritableImpl::GetAlignment());
+ return static_cast<size_t>(WinRandomAccessImpl::GetAlignment());
+}
+
+IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data,
+ const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return PositionedAppendImpl(data, offset);
+}
+
+IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n,
+ const IOOptions& /*options*/, Slice* result,
+ char* scratch, IODebugContext* /*dbg*/) const {
+ return ReadImpl(offset, n, result, scratch);
+}
+
+IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) {
+ return SyncImpl(options, dbg);
+}
+
+IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return CloseImpl();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// WinMemoryMappedBuffer
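+// Releases the mapped view, the mapping handle, and the file handle, in that
+// order.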
+WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
+ BOOL ret
+#if defined(_MSC_VER)
+ = FALSE;
+#else
+ __attribute__((__unused__));
+#endif
+ if (base_ != nullptr) {
+ ret = ::UnmapViewOfFile(base_);
+ assert(ret);
+ base_ = nullptr;
+ }
+ if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
+ ret = ::CloseHandle(map_handle_);
+ assert(ret);
+ map_handle_ = NULL;
+ }
+ if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
+ ret = ::CloseHandle(file_handle_);
+ assert(ret);
+ file_handle_ = NULL;
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// WinDirectory
+
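+// Intentionally a no-op; directory syncing is presumably not needed on
+// Windows for RocksDB's purposes.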
+IOStatus WinDirectory::Fsync(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ return IOStatus::OK();
+}
+
+IOStatus WinDirectory::Close(const IOOptions& /*options*/,
+ IODebugContext* /*dbg*/) {
+ IOStatus s = IOStatus::OK();
+ BOOL ret __attribute__((__unused__));
+ if (handle_ != INVALID_HANDLE_VALUE) {
+ ret = ::CloseHandle(handle_);
+ if (!ret) {
+ auto lastError = GetLastError();
+ s = IOErrorFromWindowsError("Directory closes failed for : " + GetName(),
+ lastError);
+ }
+ handle_ = NULL;
+ }
+ return s;
+}
+
+size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
+ return GetUniqueIdFromFile(handle_, id, max_size);
+}
+//////////////////////////////////////////////////////////////////////////
+/// WinFileLock
+
+WinFileLock::~WinFileLock() {
+ BOOL ret __attribute__((__unused__));
+ ret = ::CloseHandle(hFile_);
+ assert(ret);
+}
+
+} // namespace port
+} // namespace ROCKSDB_NAMESPACE
+
+#endif