From 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sat, 27 Apr 2024 20:24:20 +0200
Subject: Adding upstream version 14.2.21.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 src/rocksdb/utilities/env_librados.cc | 1489 +++++++++++++++++++++++++++++++++
 1 file changed, 1489 insertions(+)
 create mode 100644 src/rocksdb/utilities/env_librados.cc

(limited to 'src/rocksdb/utilities/env_librados.cc')
diff --git a/src/rocksdb/utilities/env_librados.cc b/src/rocksdb/utilities/env_librados.cc
new file mode 100644
index 00000000..753444da
--- /dev/null
+++ b/src/rocksdb/utilities/env_librados.cc
@@ -0,0 +1,1489 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "rocksdb/utilities/env_librados.h"
+#include "util/random.h"
+#include <mutex>
+#include <cstdlib>
+
+namespace rocksdb {
+/* GLOBAL DIFINE */
+// #define DEBUG
+#ifdef DEBUG
+#include <cstdio>
+#include <sys/syscall.h>
+#include <unistd.h>
+#define LOG_DEBUG(...)  do{\
+    printf("[%ld:%s:%i:%s]", syscall(SYS_gettid), __FILE__, __LINE__, __FUNCTION__);\
+    printf(__VA_ARGS__);\
+  }while(0)
+#else
+#define LOG_DEBUG(...)
+#endif
+
+/* GLOBAL CONSTANT */
+const char *default_db_name     = "default_envlibrados_db";
+const char *default_pool_name   = "default_envlibrados_pool";
+const char *default_config_path = "CEPH_CONFIG_PATH";           // the env variable name of ceph configure file
+// maximum dir/file that can store in the fs
+const int MAX_ITEMS_IN_FS = 1 << 30;
+// root dir tag
+const std::string ROOT_DIR_KEY = "/";
+const std::string DIR_ID_VALUE = "<DIR>";
+
+/**
+ * @brief convert error code to status
+ * @details Convert internal linux error code to Status
+ *
+ * @param r [description]
+ * @return [description]
+ */
+Status err_to_status(int r)
+{
+  switch (r) {
+  case 0:
+    return Status::OK();
+  case -ENOENT:
+    return Status::IOError();
+  case -ENODATA:
+  case -ENOTDIR:
+    return Status::NotFound(Status::kNone);
+  case -EINVAL:
+    return Status::InvalidArgument(Status::kNone);
+  case -EIO:
+    return Status::IOError(Status::kNone);
+  default:
+    // FIXME :(
+    assert(0 == "unrecognized error code");
+    return Status::NotSupported(Status::kNone);
+  }
+}
+
+/**
+ * @brief split file path into dir path and file name
+ * @details
+ * Because rocksdb only need a 2-level structure (dir/file), all input path will be shortened to dir/file format
+ *  For example:
+ *    b/c => dir '/b', file 'c'
+ *    /a/b/c => dir '/b', file 'c'
+ *
+ * @param fn [description]
+ * @param dir [description]
+ * @param file [description]
+ */
+void split(const std::string &fn, std::string *dir, std::string *file) {
+  LOG_DEBUG("[IN]%s\n", fn.c_str());
+  int pos = fn.size() - 1;
+  while ('/' == fn[pos]) --pos;
+  size_t fstart = fn.rfind('/', pos);
+  *file = fn.substr(fstart + 1, pos - fstart);
+
+  pos = fstart;
+  while (pos >= 0 && '/' == fn[pos]) --pos;
+
+  if (pos < 0) {
+    *dir = "/";
+  } else {
+    size_t dstart = fn.rfind('/', pos);
+    *dir = fn.substr(dstart + 1, pos - dstart);
+    *dir = std::string("/") + *dir;
+  }
+
+  LOG_DEBUG("[OUT]%s | %s\n", dir->c_str(), file->c_str());
+}
+
+// A file abstraction for reading sequentially through a file
+class LibradosSequentialFile : public SequentialFile {
+  librados::IoCtx * _io_ctx;
+  std::string _fid;
+  std::string _hint;
+  int _offset;
+public:
+  LibradosSequentialFile(librados::IoCtx * io_ctx, std::string fid, std::string hint):
+    _io_ctx(io_ctx), _fid(fid), _hint(hint), _offset(0) {}
+
+  ~LibradosSequentialFile() {}
+
+  /**
+   * @brief read file
+   * @details
+   *  Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+   *  written by this routine.  Sets "*result" to the data that was
+   *  read (including if fewer than "n" bytes were successfully read).
+   *  May set "*result" to point at data in "scratch[0..n-1]", so
+   *  "scratch[0..n-1]" must be live when "*result" is used.
+   *  If an error was encountered, returns a non-OK status.
+   *
+   *  REQUIRES: External synchronization
+   *
+   * @param n [description]
+   * @param result [description]
+   * @param scratch [description]
+   * @return [description]
+   */
+  Status Read(size_t n, Slice* result, char* scratch) {
+    LOG_DEBUG("[IN]%i\n", (int)n);
+    librados::bufferlist buffer;
+    Status s;
+    int r = _io_ctx->read(_fid, buffer, n, _offset);
+    if (r >= 0) {
+      buffer.copy(0, r, scratch);
+      *result = Slice(scratch, r);
+      _offset += r;
+      s = Status::OK();
+    } else {
+      s = err_to_status(r);
+      if (s == Status::IOError()) {
+        *result = Slice();
+        s = Status::OK();
+      }
+    }
+    LOG_DEBUG("[OUT]%s, %i, %s\n", s.ToString().c_str(), (int)r, buffer.c_str());
+    return s;
+  }
+
+  /**
+   * @brief skip "n" bytes from the file
+   * @details
+   *  Skip "n" bytes from the file. This is guaranteed to be no
+   *  slower that reading the same data, but may be faster.
+   *
+   *  If end of file is reached, skipping will stop at the end of the
+   *  file, and Skip will return OK.
+   *
+   *  REQUIRES: External synchronization
+   *
+   * @param n [description]
+   * @return [description]
+   */
+  Status Skip(uint64_t n) {
+    _offset += n;
+    return Status::OK();
+  }
+
+  /**
+   * @brief noop
+   * @details
+   *  rocksdb has it's own caching capabilities that we should be able to use,
+   *  without relying on a cache here. This can safely be a no-op.
+   *
+   * @param offset [description]
+   * @param length [description]
+   *
+   * @return [description]
+   */
+  Status InvalidateCache(size_t offset, size_t length) {
+    return Status::OK();
+  }
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class LibradosRandomAccessFile : public RandomAccessFile {
+  librados::IoCtx * _io_ctx;
+  std::string _fid;
+  std::string _hint;
+public:
+  LibradosRandomAccessFile(librados::IoCtx * io_ctx, std::string fid, std::string hint):
+    _io_ctx(io_ctx), _fid(fid), _hint(hint) {}
+
+  ~LibradosRandomAccessFile() {}
+
+  /**
+   * @brief read file
+   * @details similar to LibradosSequentialFile::Read
+   *
+   * @param offset [description]
+   * @param n [description]
+   * @param result [description]
+   * @param scratch [description]
+   * @return [description]
+   */
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const {
+    LOG_DEBUG("[IN]%i\n", (int)n);
+    librados::bufferlist buffer;
+    Status s;
+    int r = _io_ctx->read(_fid, buffer, n, offset);
+    if (r >= 0) {
+      buffer.copy(0, r, scratch);
+      *result = Slice(scratch, r);
+      s = Status::OK();
+    } else {
+      s = err_to_status(r);
+      if (s == Status::IOError()) {
+        *result = Slice();
+        s = Status::OK();
+      }
+    }
+    LOG_DEBUG("[OUT]%s, %i, %s\n", s.ToString().c_str(), (int)r, buffer.c_str());
+    return s;
+  }
+
+  /**
+   * @brief [brief description]
+   * @details Get unique id for each file and guarantee this id is different for each file
+   *
+   * @param id [description]
+   * @param max_size max size of id, it shoud be larger than 16
+   *
+   * @return [description]
+   */
+  size_t GetUniqueId(char* id, size_t max_size) const {
+    // All fid has the same db_id prefix, so we need to ignore db_id prefix
+    size_t s = std::min(max_size, _fid.size());
+    strncpy(id, _fid.c_str() + (_fid.size() - s), s);
+    id[s - 1] = '\0';
+    return s;
+  };
+
+  //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+  void Hint(AccessPattern pattern) {
+    /* Do nothing */
+  }
+
+  /**
+   * @brief noop
+   * @details [long description]
+   *
+   * @param offset [description]
+   * @param length [description]
+   *
+   * @return [description]
+   */
+  Status InvalidateCache(size_t offset, size_t length) {
+    return Status::OK();
+  }
+};
+
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class LibradosWritableFile : public WritableFile {
+  librados::IoCtx * _io_ctx;
+  std::string _fid;
+  std::string _hint;
+  const EnvLibrados * const _env;
+
+  std::mutex _mutex;                 // used to protect modification of all following variables
+  librados::bufferlist _buffer;      // write buffer
+  uint64_t _buffer_size;             // write buffer size
+  uint64_t _file_size;               // this file size doesn't include buffer size
+
+  /**
+   * @brief assuming caller holds lock
+   * @details [long description]
+   * @return [description]
+   */
+  int _SyncLocked() {
+    // 1. sync append data to RADOS
+    int r = _io_ctx->append(_fid, _buffer, _buffer_size);
+    assert(r >= 0);
+
+    // 2. update local variables
+    if (0 == r) {
+      _buffer.clear();
+      _file_size += _buffer_size;
+      _buffer_size = 0;
+    }
+
+    return r;
+  }
+
+public:
+  LibradosWritableFile(librados::IoCtx * io_ctx,
+                       std::string fid,
+                       std::string hint,
+                       const EnvLibrados * const env)
+    : _io_ctx(io_ctx), _fid(fid), _hint(hint), _env(env), _buffer(), _buffer_size(0), _file_size(0) {
+    int ret = _io_ctx->stat(_fid, &_file_size, nullptr);
+
+    // if file not exist
+    if (ret < 0) {
+      _file_size = 0;
+    }
+  }
+
+  ~LibradosWritableFile() {
+    // sync before closeing writable file
+    Sync();
+  }
+
+  /**
+   * @brief append data to file
+   * @details
+   *  Append will save all written data in buffer util buffer size
+   *  reaches buffer max size. Then, it will write buffer into rados
+   *
+   * @param data [description]
+   * @return [description]
+   */
+  Status Append(const Slice& data) {
+    // append buffer
+    LOG_DEBUG("[IN] %i | %s\n", (int)data.size(), data.data());
+    int r = 0;
+
+    std::lock_guard<std::mutex> lock(_mutex);
+    _buffer.append(data.data(), data.size());
+    _buffer_size += data.size();
+
+    if (_buffer_size > _env->_write_buffer_size) {
+      r = _SyncLocked();
+    }
+
+    LOG_DEBUG("[OUT] %i\n", r);
+    return err_to_status(r);
+  }
+
+  /**
+   * @brief not supported
+   * @details [long description]
+   * @return [description]
+   */
+  Status PositionedAppend(
+    const Slice& /* data */,
+    uint64_t /* offset */) {
+    return Status::NotSupported();
+  }
+
+  /**
+   * @brief truncate file to assigned size
+   * @details [long description]
+   *
+   * @param size [description]
+   * @return [description]
+   */
+  Status Truncate(uint64_t size) {
+    LOG_DEBUG("[IN]%lld|%lld|%lld\n", (long long)size, (long long)_file_size, (long long)_buffer_size);
+    int r = 0;
+
+    std::lock_guard<std::mutex> lock(_mutex);
+    if (_file_size > size) {
+      r = _io_ctx->trunc(_fid, size);
+
+      if (r == 0) {
+        _buffer.clear();
+        _buffer_size = 0;
+        _file_size = size;
+      }
+    } else if (_file_size == size) {
+      _buffer.clear();
+      _buffer_size = 0;
+    } else {
+      librados::bufferlist tmp;
+      tmp.claim(_buffer);
+      _buffer.substr_of(tmp, 0, size - _file_size);
+      _buffer_size = size - _file_size;
+    }
+
+    LOG_DEBUG("[OUT] %i\n", r);
+    return err_to_status(r);
+  }
+
+  /**
+   * @brief close file
+   * @details [long description]
+   * @return [description]
+   */
+  Status Close() {
+    LOG_DEBUG("%s | %lld | %lld\n", _hint.c_str(), (long long)_buffer_size, (long long)_file_size);
+    return Sync();
+  }
+
+  /**
+   * @brief flush file,
+   * @details initiate an aio write and not wait
+   *
+   * @return [description]
+   */
+  Status Flush() {
+    librados::AioCompletion *write_completion = librados::Rados::aio_create_completion();
+    int r = 0;
+
+    std::lock_guard<std::mutex> lock(_mutex);
+    r = _io_ctx->aio_append(_fid, write_completion, _buffer, _buffer_size);
+
+    if (0 == r) {
+      _file_size += _buffer_size;
+      _buffer.clear();
+      _buffer_size = 0;
+    }
+
+    write_completion->release();
+
+    return err_to_status(r);
+  }
+
+  /**
+   * @brief write buffer data to rados
+   * @details initiate an aio write and wait for result
+   * @return [description]
+   */
+  Status Sync() { // sync data
+    int r = 0;
+
+    std::lock_guard<std::mutex> lock(_mutex);
+    if (_buffer_size > 0) {
+      r = _SyncLocked();
+    }
+
+    return err_to_status(r);
+  }
+
+  /**
+   * @brief [brief description]
+   * @details [long description]
+   * @return true if Sync() and Fsync() are safe to call concurrently with Append()and Flush().
+   */
+  bool IsSyncThreadSafe() const {
+    return true;
+  }
+
+  /**
+   * @brief Indicates the upper layers if the current WritableFile implementation uses direct IO.
+   * @details [long description]
+   * @return [description]
+   */
+  bool use_direct_io() const {
+    return false;
+  }
+
+  /**
+   * @brief Get file size
+   * @details
+   *  This API will use cached file_size.
+   * @return [description]
+   */
+  uint64_t GetFileSize() {
+    LOG_DEBUG("%lld|%lld\n", (long long)_buffer_size, (long long)_file_size);
+
+    std::lock_guard<std::mutex> lock(_mutex);
+    int file_size = _file_size + _buffer_size;
+
+    return file_size;
+  }
+
+  /**
+   * @brief For documentation, refer to RandomAccessFile::GetUniqueId()
+   * @details [long description]
+   *
+   * @param id [description]
+   * @param max_size [description]
+   *
+   * @return [description]
+   */
+  size_t GetUniqueId(char* id, size_t max_size) const {
+    // All fid has the same db_id prefix, so we need to ignore db_id prefix
+    size_t s = std::min(max_size, _fid.size());
+    strncpy(id, _fid.c_str() + (_fid.size() - s), s);
+    id[s - 1] = '\0';
+    return s;
+  }
+
+  /**
+   * @brief noop
+   * @details [long description]
+   *
+   * @param offset [description]
+   * @param length [description]
+   *
+   * @return [description]
+   */
+  Status InvalidateCache(size_t offset, size_t length) {
+    return Status::OK();
+  }
+
+  using WritableFile::RangeSync;
+  /**
+   * @brief No RangeSync support, just call Sync()
+   * @details [long description]
+   *
+   * @param offset [description]
+   * @param nbytes [description]
+   *
+   * @return [description]
+   */
+  Status RangeSync(off_t offset, off_t nbytes) {
+    return Sync();
+  }
+
+protected:
+  using WritableFile::Allocate;
+  /**
+   * @brief noop
+   * @details [long description]
+   *
+   * @param offset [description]
+   * @param len [description]
+   *
+   * @return [description]
+   */
+  Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+};
+
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class LibradosDirectory : public Directory {
+  librados::IoCtx * _io_ctx;
+  std::string _fid;
+public:
+  explicit LibradosDirectory(librados::IoCtx * io_ctx, std::string fid):
+    _io_ctx(io_ctx), _fid(fid) {}
+
+  // Fsync directory. Can be called concurrently from multiple threads.
+  Status Fsync() {
+    return Status::OK();
+  }
+};
+
+// Identifies a locked file.
+// This is exclusive lock and can't nested lock by same thread
+class LibradosFileLock : public FileLock {
+  librados::IoCtx * _io_ctx;
+  const std::string _obj_name;
+  const std::string _lock_name;
+  const std::string _cookie;
+  int lock_state;
+public:
+  LibradosFileLock(
+    librados::IoCtx * io_ctx,
+    const std::string obj_name):
+    _io_ctx(io_ctx),
+    _obj_name(obj_name),
+    _lock_name("lock_name"),
+    _cookie("cookie") {
+
+    // TODO: the lock will never expire. It may cause problem if the process crash or abnormally exit.
+    while (!_io_ctx->lock_exclusive(
+             _obj_name,
+             _lock_name,
+             _cookie,
+             "description", nullptr, 0));
+  }
+
+  ~LibradosFileLock() {
+    _io_ctx->unlock(_obj_name, _lock_name, _cookie);
+  }
+};
+
+
+// --------------------
+// --- EnvLibrados ----
+// --------------------
+/**
+ * @brief EnvLibrados ctor
+ * @details [long description]
+ *
+ * @param db_name unique database name
+ * @param config_path the configure file path for rados
+ */
+EnvLibrados::EnvLibrados(const std::string& db_name,
+                         const std::string& config_path,
+                         const std::string& db_pool)
+  : EnvLibrados("client.admin",
+                "ceph",
+                0,
+                db_name,
+                config_path,
+                db_pool,
+                "/wal",
+                db_pool,
+                1 << 20) {}
+
+/**
+ * @brief EnvLibrados ctor
+ * @details [long description]
+ *
+ * @param client_name       first 3 parameters is for RADOS client init
+ * @param cluster_name
+ * @param flags
+ * @param db_name           unique database name, used as db_id key
+ * @param config_path the   configure file path for rados
+ * @param db_pool the pool  for db data
+ * @param wal_pool the pool for WAL data
+ * @param write_buffer_size WritableFile buffer max size
+ */
+EnvLibrados::EnvLibrados(const std::string& client_name,
+                         const std::string& cluster_name,
+                         const uint64_t flags,
+                         const std::string& db_name,
+                         const std::string& config_path,
+                         const std::string& db_pool,
+                         const std::string& wal_dir,
+                         const std::string& wal_pool,
+                         const uint64_t write_buffer_size)
+  : EnvWrapper(Env::Default()),
+    _client_name(client_name),
+    _cluster_name(cluster_name),
+    _flags(flags),
+    _db_name(db_name),
+    _config_path(config_path),
+    _db_pool_name(db_pool),
+    _wal_dir(wal_dir),
+    _wal_pool_name(wal_pool),
+    _write_buffer_size(write_buffer_size) {
+  int ret = 0;
+
+  // 1. create a Rados object and initialize it
+  ret = _rados.init2(_client_name.c_str(), _cluster_name.c_str(), _flags); // just use the client.admin keyring
+  if (ret < 0) { // let's handle any error that might have come back
+    std::cerr << "couldn't initialize rados! error " << ret << std::endl;
+    ret = EXIT_FAILURE;
+    goto out;
+  }
+
+  // 2. read configure file
+  ret = _rados.conf_read_file(_config_path.c_str());
+  if (ret < 0) {
+    // This could fail if the config file is malformed, but it'd be hard.
+    std::cerr << "failed to parse config file " << _config_path
+              << "! error" << ret << std::endl;
+    ret = EXIT_FAILURE;
+    goto out;
+  }
+
+  // 3. we actually connect to the cluster
+  ret = _rados.connect();
+  if (ret < 0) {
+    std::cerr << "couldn't connect to cluster! error " << ret << std::endl;
+    ret = EXIT_FAILURE;
+    goto out;
+  }
+
+  // 4. create db_pool if not exist
+  ret = _rados.pool_create(_db_pool_name.c_str());
+  if (ret < 0 && ret != -EEXIST && ret !=  -EPERM) {
+    std::cerr << "couldn't create pool! error " << ret << std::endl;
+    goto out;
+  }
+
+  // 5. create db_pool_ioctx
+  ret = _rados.ioctx_create(_db_pool_name.c_str(), _db_pool_ioctx);
+  if (ret < 0) {
+    std::cerr << "couldn't set up ioctx! error " << ret << std::endl;
+    ret = EXIT_FAILURE;
+    goto out;
+  }
+
+  // 6. create wal_pool if not exist
+  ret = _rados.pool_create(_wal_pool_name.c_str());
+  if (ret < 0 && ret != -EEXIST && ret !=  -EPERM) {
+    std::cerr << "couldn't create pool! error " << ret << std::endl;
+    goto out;
+  }
+
+  // 7. create wal_pool_ioctx
+  ret = _rados.ioctx_create(_wal_pool_name.c_str(), _wal_pool_ioctx);
+  if (ret < 0) {
+    std::cerr << "couldn't set up ioctx! error " << ret << std::endl;
+    ret = EXIT_FAILURE;
+    goto out;
+  }
+
+  // 8. add root dir
+  _AddFid(ROOT_DIR_KEY, DIR_ID_VALUE);
+
+out:
+  LOG_DEBUG("rados connect result code : %i\n", ret);
+}
+
+/****************************************************
+  private functions to handle fid operation.
+  Dir also have fid, but the value is DIR_ID_VALUE
+****************************************************/
+
+/**
+ * @brief generate a new fid
+ * @details [long description]
+ * @return [description]
+ */
+std::string EnvLibrados::_CreateFid() {
+  return _db_name + "." + GenerateUniqueId();
+}
+
+/**
+ * @brief get fid
+ * @details [long description]
+ *
+ * @param fname [description]
+ * @param fid [description]
+ *
+ * @return
+ *  Status::OK()
+ *  Status::NotFound()
+ */
+Status EnvLibrados::_GetFid(
+  const std::string &fname,
+  std::string& fid) {
+  std::set<std::string> keys;
+  std::map<std::string, librados::bufferlist> kvs;
+  keys.insert(fname);
+  int r = _db_pool_ioctx.omap_get_vals_by_keys(_db_name, keys, &kvs);
+
+  if (0 == r && 0 == kvs.size()) {
+    return Status::NotFound();
+  } else if (0 == r && 0 != kvs.size()) {
+    fid.assign(kvs[fname].c_str(), kvs[fname].length());
+    return Status::OK();
+  } else {
+    return err_to_status(r);
+  }
+}
+
+/**
+ * @brief rename fid
+ * @details Only modify object in rados once,
+ * so this rename operation is atomic in term of rados
+ *
+ * @param old_fname [description]
+ * @param new_fname [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::_RenameFid(const std::string& old_fname,
+                               const std::string& new_fname) {
+  std::string fid;
+  Status s = _GetFid(old_fname, fid);
+
+  if (Status::OK() != s) {
+    return s;
+  }
+
+  librados::bufferlist bl;
+  std::set<std::string> keys;
+  std::map<std::string, librados::bufferlist> kvs;
+  librados::ObjectWriteOperation o;
+  bl.append(fid);
+  keys.insert(old_fname);
+  kvs[new_fname] = bl;
+  o.omap_rm_keys(keys);
+  o.omap_set(kvs);
+  int r = _db_pool_ioctx.operate(_db_name, &o);
+  return err_to_status(r);
+}
+
+/**
+ * @brief add <file path, fid> to metadata object. It may overwrite exist key.
+ * @details [long description]
+ *
+ * @param fname [description]
+ * @param fid [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::_AddFid(
+  const std::string& fname,
+  const std::string& fid) {
+  std::map<std::string, librados::bufferlist> kvs;
+  librados::bufferlist value;
+  value.append(fid);
+  kvs[fname] = value;
+  int r = _db_pool_ioctx.omap_set(_db_name, kvs);
+  return err_to_status(r);
+}
+
+/**
+ * @brief return subfile names of dir.
+ * @details
+ *  RocksDB has a 2-level structure, so all keys
+ *  that have dir as prefix are subfiles of dir.
+ *  So we can just return these files' name.
+ *
+ * @param dir [description]
+ * @param result [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::_GetSubFnames(
+  const std::string& dir,
+  std::vector<std::string> * result
+) {
+  std::string start_after(dir);
+  std::string filter_prefix(dir);
+  std::map<std::string, librados::bufferlist> kvs;
+  _db_pool_ioctx.omap_get_vals(_db_name,
+                               start_after, filter_prefix,
+                               MAX_ITEMS_IN_FS, &kvs);
+
+  result->clear();
+  for (auto i = kvs.begin(); i != kvs.end(); i++) {
+    result->push_back(i->first.substr(dir.size() + 1));
+  }
+  return Status::OK();
+}
+
+/**
+ * @brief delete key fname from metadata object
+ * @details [long description]
+ *
+ * @param fname [description]
+ * @return [description]
+ */
+Status EnvLibrados::_DelFid(
+  const std::string& fname) {
+  std::set<std::string> keys;
+  keys.insert(fname);
+  int r = _db_pool_ioctx.omap_rm_keys(_db_name, keys);
+  return err_to_status(r);
+}
+
+/**
+ * @brief get match IoCtx from _prefix_pool_map
+ * @details [long description]
+ *
+ * @param prefix [description]
+ * @return [description]
+ *
+ */
+librados::IoCtx* EnvLibrados::_GetIoctx(const std::string& fpath) {
+  auto is_prefix = [](const std::string & s1, const std::string & s2) {
+    auto it1 = s1.begin(), it2 = s2.begin();
+    while (it1 != s1.end() && it2 != s2.end() && *it1 == *it2) ++it1, ++it2;
+    return it1 == s1.end();
+  };
+
+  if (is_prefix(_wal_dir, fpath)) {
+    return &_wal_pool_ioctx;
+  } else {
+    return &_db_pool_ioctx;
+  }
+}
+
+/************************************************************
+                public functions
+************************************************************/
+/**
+ * @brief generate unique id
+ * @details Combine system time and random number.
+ * @return [description]
+ */
+std::string EnvLibrados::GenerateUniqueId() {
+  Random64 r(time(nullptr));
+  uint64_t random_uuid_portion =
+    r.Uniform(std::numeric_limits<uint64_t>::max());
+  uint64_t nanos_uuid_portion = NowNanos();
+  char uuid2[200];
+  snprintf(uuid2,
+           200,
+           "%16lx-%16lx",
+           (unsigned long)nanos_uuid_portion,
+           (unsigned long)random_uuid_portion);
+  return uuid2;
+}
+
+/**
+ * @brief create a new sequential read file handler
+ * @details it will check the existence of fname
+ *
+ * @param fname [description]
+ * @param result [description]
+ * @param options [description]
+ * @return [description]
+ */
+Status EnvLibrados::NewSequentialFile(
+  const std::string& fname,
+  std::unique_ptr<SequentialFile>* result,
+  const EnvOptions& options)
+{
+  LOG_DEBUG("[IN]%s\n", fname.c_str());
+  std::string dir, file, fid;
+  split(fname, &dir, &file);
+  Status s;
+  std::string fpath = dir + "/" + file;
+  do {
+    s = _GetFid(dir, fid);
+
+    if (!s.ok() || fid != DIR_ID_VALUE) {
+      if (fid != DIR_ID_VALUE) s = Status::IOError();
+      break;
+    }
+
+    s = _GetFid(fpath, fid);
+
+    if (Status::NotFound() == s) {
+      s = Status::IOError();
+      errno = ENOENT;
+      break;
+    }
+
+    result->reset(new LibradosSequentialFile(_GetIoctx(fpath), fid, fpath));
+    s = Status::OK();
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief create a new random access file handler
+ * @details it will check the existence of fname
+ *
+ * @param fname [description]
+ * @param result [description]
+ * @param options [description]
+ * @return [description]
+ */
+Status EnvLibrados::NewRandomAccessFile(
+  const std::string& fname,
+  std::unique_ptr<RandomAccessFile>* result,
+  const EnvOptions& options)
+{
+  LOG_DEBUG("[IN]%s\n", fname.c_str());
+  std::string dir, file, fid;
+  split(fname, &dir, &file);
+  Status s;
+  std::string fpath = dir + "/" + file;
+  do {
+    s = _GetFid(dir, fid);
+
+    if (!s.ok() || fid != DIR_ID_VALUE) {
+      s = Status::IOError();
+      break;
+    }
+
+    s = _GetFid(fpath, fid);
+
+    if (Status::NotFound() == s) {
+      s = Status::IOError();
+      errno = ENOENT;
+      break;
+    }
+
+    result->reset(new LibradosRandomAccessFile(_GetIoctx(fpath), fid, fpath));
+    s = Status::OK();
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief create a new write file handler
+ * @details it will check the existence of fname
+ *
+ * @param fname [description]
+ * @param result [description]
+ * @param options [description]
+ * @return [description]
+ */
+Status EnvLibrados::NewWritableFile(
+  const std::string& fname,
+  std::unique_ptr<WritableFile>* result,
+  const EnvOptions& options)
+{
+  LOG_DEBUG("[IN]%s\n", fname.c_str());
+  std::string dir, file, fid;
+  split(fname, &dir, &file);
+  Status s;
+  std::string fpath = dir + "/" + file;
+
+  do {
+    // 1. check if dir exist
+    s = _GetFid(dir, fid);
+    if (!s.ok()) {
+      break;
+    }
+
+    if (fid != DIR_ID_VALUE) {
+      s = Status::IOError();
+      break;
+    }
+
+    // 2. check if file exist.
+    // 2.1 exist, use it
+    // 2.2 not exist, create it
+    s = _GetFid(fpath, fid);
+    if (Status::NotFound() == s) {
+      fid = _CreateFid();
+      _AddFid(fpath, fid);
+    }
+
+    result->reset(new LibradosWritableFile(_GetIoctx(fpath), fid, fpath, this));
+    s = Status::OK();
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief reuse write file handler
+ * @details
+ *  This function will rename old_fname to new_fname,
+ *  then return the handler of new_fname
+ *
+ * @param new_fname [description]
+ * @param old_fname [description]
+ * @param result [description]
+ * @param options [description]
+ * @return [description]
+ */
+Status EnvLibrados::ReuseWritableFile(
+  const std::string& new_fname,
+  const std::string& old_fname,
+  std::unique_ptr<WritableFile>* result,
+  const EnvOptions& options)
+{
+  LOG_DEBUG("[IN]%s => %s\n", old_fname.c_str(), new_fname.c_str());
+  std::string src_fid, tmp_fid, src_dir, src_file, dst_dir, dst_file;
+  split(old_fname, &src_dir, &src_file);
+  split(new_fname, &dst_dir, &dst_file);
+
+  std::string src_fpath = src_dir + "/" + src_file;
+  std::string dst_fpath = dst_dir + "/" + dst_file;
+  Status r = Status::OK();
+  do {
+    r = _RenameFid(src_fpath,
+                   dst_fpath);
+    if (!r.ok()) {
+      break;
+    }
+
+    result->reset(new LibradosWritableFile(_GetIoctx(dst_fpath), src_fid, dst_fpath, this));
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", r.ToString().c_str());
+  return r;
+}
+
+/**
+ * @brief create a new directory handler
+ * @details [long description]
+ *
+ * @param name [description]
+ * @param result [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::NewDirectory(
+  const std::string& name,
+  std::unique_ptr<Directory>* result)
+{
+  LOG_DEBUG("[IN]%s\n", name.c_str());
+  std::string fid, dir, file;
+  /* just want to get dir name */
+  split(name + "/tmp", &dir, &file);
+  Status s;
+
+  do {
+    s = _GetFid(dir, fid);
+
+    if (!s.ok() || DIR_ID_VALUE != fid) {
+      s = Status::IOError(name, strerror(-ENOENT));
+      break;
+    }
+
+    if (Status::NotFound() == s) {
+      s = _AddFid(dir, DIR_ID_VALUE);
+      if (!s.ok()) break;
+    } else if (!s.ok()) {
+      break;
+    }
+
+    result->reset(new LibradosDirectory(_GetIoctx(dir), dir));
+    s = Status::OK();
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief check if fname is exist
+ * @details [long description]
+ *
+ * @param fname [description]
+ * @return [description]
+ */
+Status EnvLibrados::FileExists(const std::string& fname)
+{
+  LOG_DEBUG("[IN]%s\n", fname.c_str());
+  std::string fid, dir, file;
+  split(fname, &dir, &file);
+  Status s = _GetFid(dir + "/" + file, fid);
+
+  if (s.ok() && fid != DIR_ID_VALUE) {
+    s = Status::OK();
+  }
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief get subfile name of dir_in
+ * @details [long description]
+ *
+ * @param dir_in [description]
+ * @param result [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::GetChildren(
+  const std::string& dir_in,
+  std::vector<std::string>* result)
+{
+  LOG_DEBUG("[IN]%s\n", dir_in.c_str());
+  std::string fid, dir, file;
+  split(dir_in + "/temp", &dir, &file);
+  Status s;
+
+  do {
+    s = _GetFid(dir, fid);
+    if (!s.ok()) {
+      break;
+    }
+
+    if (fid != DIR_ID_VALUE) {
+      s = Status::IOError();
+      break;
+    }
+
+    s = _GetSubFnames(dir, result);
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief delete fname
+ * @details [long description]
+ *
+ * @param fname [description]
+ * @return [description]
+ */
+Status EnvLibrados::DeleteFile(const std::string& fname)
+{
+  LOG_DEBUG("[IN]%s\n", fname.c_str());
+  std::string fid, dir, file;
+  split(fname, &dir, &file);
+  Status s = _GetFid(dir + "/" + file, fid);
+
+  if (s.ok() && DIR_ID_VALUE != fid) {
+    s = _DelFid(dir + "/" + file);
+  } else {
+    s = Status::NotFound();
+  }
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief create new dir
+ * @details [long description]
+ *
+ * @param dirname [description]
+ * @return [description]
+ */
+Status EnvLibrados::CreateDir(const std::string& dirname)
+{
+  LOG_DEBUG("[IN]%s\n", dirname.c_str());
+  std::string fid, dir, file;
+  split(dirname + "/temp", &dir, &file);
+  Status s = _GetFid(dir + "/" + file, fid);
+
+  do {
+    if (Status::NotFound() != s && fid != DIR_ID_VALUE) {
+      break;
+    } else if (Status::OK() == s && fid == DIR_ID_VALUE) {
+      break;
+    }
+
+    s = _AddFid(dir, DIR_ID_VALUE);
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief create dir if missing
+ * @details [long description]
+ *
+ * @param dirname [description]
+ * @return [description]
+ */
+Status EnvLibrados::CreateDirIfMissing(const std::string& dirname)
+{
+  LOG_DEBUG("[IN]%s\n", dirname.c_str());
+  std::string fid, dir, file;
+  split(dirname + "/temp", &dir, &file);
+  Status s = Status::OK();
+
+  do {
+    s = _GetFid(dir, fid);
+    if (Status::NotFound() != s) {
+      break;
+    }
+
+    s = _AddFid(dir, DIR_ID_VALUE);
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief delete dir
+ * @details
+ *
+ * @param dirname [description]
+ * @return [description]
+ */
+Status EnvLibrados::DeleteDir(const std::string& dirname)
+{
+  LOG_DEBUG("[IN]%s\n", dirname.c_str());
+  std::string fid, dir, file;
+  split(dirname + "/temp", &dir, &file);
+  Status s = Status::OK();
+
+  s = _GetFid(dir, fid);
+
+  if (s.ok() && DIR_ID_VALUE == fid) {
+    std::vector<std::string> subs;
+    s = _GetSubFnames(dir, &subs);
+    // if subfiles exist, can't delete dir
+    if (subs.size() > 0) {
+      s = Status::IOError();
+    } else {
+      s = _DelFid(dir);
+    }
+  } else {
+    s = Status::NotFound();
+  }
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief return file size
+ * @details [long description]
+ *
+ * @param fname [description]
+ * @param file_size [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::GetFileSize(
+  const std::string& fname,
+  uint64_t* file_size)
+{
+  LOG_DEBUG("[IN]%s\n", fname.c_str());
+  std::string fid, dir, file;
+  split(fname, &dir, &file);
+  time_t mtime;
+  Status s;
+
+  do {
+    std::string fpath = dir + "/" + file;
+    s = _GetFid(fpath, fid);
+
+    if (!s.ok()) {
+      break;
+    }
+
+    int ret = _GetIoctx(fpath)->stat(fid, file_size, &mtime);
+    if (ret < 0) {
+      LOG_DEBUG("%i\n", ret);
+      if (-ENOENT == ret) {
+        *file_size = 0;
+        s = Status::OK();
+      } else {
+        s = err_to_status(ret);
+      }
+    } else {
+      s = Status::OK();
+    }
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s|%lld\n", s.ToString().c_str(), (long long)*file_size);
+  return s;
+}
+
+/**
+ * @brief get file modification time
+ * @details [long description]
+ *
+ * @param fname [description]
+ * @param file_mtime [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::GetFileModificationTime(const std::string& fname,
+    uint64_t* file_mtime)
+{
+  LOG_DEBUG("[IN]%s\n", fname.c_str());
+  std::string fid, dir, file;
+  split(fname, &dir, &file);
+  time_t mtime;
+  uint64_t file_size;
+  Status s = Status::OK();
+  do {
+    std::string fpath = dir + "/" + file;
+    s = _GetFid(dir + "/" + file, fid);
+
+    if (!s.ok()) {
+      break;
+    }
+
+    int ret = _GetIoctx(fpath)->stat(fid, &file_size, &mtime);
+    if (ret < 0) {
+      if (Status::NotFound() == err_to_status(ret)) {
+        *file_mtime = static_cast<uint64_t>(mtime);
+        s = Status::OK();
+      } else {
+        s = err_to_status(ret);
+      }
+    } else {
+      s = Status::OK();
+    }
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief rename file
+ * @details
+ *
+ * @param src [description]
+ * @param target_in [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::RenameFile(
+  const std::string& src,
+  const std::string& target_in)
+{
+  LOG_DEBUG("[IN]%s => %s\n", src.c_str(), target_in.c_str());
+  std::string src_fid, tmp_fid, src_dir, src_file, dst_dir, dst_file;
+  split(src, &src_dir, &src_file);
+  split(target_in, &dst_dir, &dst_file);
+
+  auto s = _RenameFid(src_dir + "/" + src_file,
+                      dst_dir + "/" + dst_file);
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief not support
+ * @details [long description]
+ *
+ * @param src [description]
+ * @param target_in [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::LinkFile(
+  const std::string& src,
+  const std::string& target_in)
+{
+  LOG_DEBUG("[IO]%s => %s\n", src.c_str(), target_in.c_str());
+  return Status::NotSupported();
+}
+
+/**
+ * @brief lock file. create if missing.
+ * @details [long description]
+ *
+ * It seems that LockFile is used for preventing other instance of RocksDB
+ * from opening up the database at the same time. From RocksDB source code,
+ * the invokes of LockFile are at following locations:
+ *
+ *  ./db/db_impl.cc:1159:    s = env_->LockFile(LockFileName(dbname_), &db_lock_);    // DBImpl::Recover
+ *  ./db/db_impl.cc:5839:  Status result = env->LockFile(lockname, &lock);            // Status DestroyDB
+ *
+ * When db recovery and db destroy, RocksDB will call LockFile
+ *
+ * @param fname [description]
+ * @param lock [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::LockFile(
+  const std::string& fname,
+  FileLock** lock)
+{
+  LOG_DEBUG("[IN]%s\n", fname.c_str());
+  std::string fid, dir, file;
+  split(fname, &dir, &file);
+  Status s = Status::OK();
+
+  do {
+    std::string fpath = dir + "/" + file;
+    s = _GetFid(fpath, fid);
+
+    if (Status::OK() != s &&
+        Status::NotFound() != s) {
+      break;
+    } else if (Status::NotFound() == s) {
+      s = _AddFid(fpath, _CreateFid());
+      if (!s.ok()) {
+        break;
+      }
+    } else if (Status::OK() == s && DIR_ID_VALUE == fid) {
+      s = Status::IOError();
+      break;
+    }
+
+    *lock = new LibradosFileLock(_GetIoctx(fpath), fpath);
+  } while (0);
+
+  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
+  return s;
+}
+
+/**
+ * @brief unlock file
+ * @details [long description]
+ *
+ * @param lock [description]
+ * @return [description]
+ */
+Status EnvLibrados::UnlockFile(FileLock* lock)
+{
+  LOG_DEBUG("[IO]%p\n", lock);
+  if (nullptr != lock) {
+    delete lock;
+  }
+  return Status::OK();
+}
+
+
+/**
+ * @brief not support
+ * @details [long description]
+ *
+ * @param db_path [description]
+ * @param output_path [description]
+ *
+ * @return [description]
+ */
+Status EnvLibrados::GetAbsolutePath(
+  const std::string& db_path,
+  std::string* output_path)
+{
+  LOG_DEBUG("[IO]%s\n", db_path.c_str());
+  return Status::NotSupported();
+}
+
+/**
+ * @brief Get default EnvLibrados
+ * @details [long description]
+ * @return [description]
+ */
+EnvLibrados* EnvLibrados::Default() {
+  static EnvLibrados default_env(default_db_name,
+                                 std::getenv(default_config_path),
+                                 default_pool_name);
+  return &default_env;
+}
+// @lint-ignore TXT4 T25377293 Grandfathered in
+}
\ No newline at end of file
-- 
cgit v1.2.3