2 files changed, 1076 insertions, 0 deletions
diff --git a/src/blk/spdk/NVMEDevice.cc b/src/blk/spdk/NVMEDevice.cc
new file mode 100644
index 000000000..4461f6a07
--- /dev/null
+++ b/src/blk/spdk/NVMEDevice.cc
@@ -0,0 +1,992 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+//
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+  *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <fstream>
+#include <functional>
+#include <map>
+#include <thread>
+#include <boost/intrusive/slist.hpp>
+
+#include <spdk/nvme.h>
+
+#include "include/intarith.h"
+#include "include/stringify.h"
+#include "include/types.h"
+#include "include/compat.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "common/perf_counters.h"
+
+#include "NVMEDevice.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_bdev
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev(" << sn << ") "
+
+using namespace std;
+
+static constexpr uint16_t data_buffer_default_num = 1024;
+
+static constexpr uint32_t data_buffer_size = 8192;
+
+static constexpr uint16_t inline_segment_num = 32;
+
+/* Default to 10 seconds for the keep alive value. This value is arbitrary. */
+static constexpr uint32_t nvme_ctrlr_keep_alive_timeout_in_ms = 10000;
+
+static void io_complete(void *t, const struct spdk_nvme_cpl *completion);
+
+struct IORequest {
+  uint16_t cur_seg_idx = 0;
+  uint16_t nseg;
+  uint32_t cur_seg_left = 0;
+  void *inline_segs[inline_segment_num];
+  void **extra_segs = nullptr;
+};
+
+namespace bi = boost::intrusive;
+struct data_cache_buf : public bi::slist_base_hook<bi::link_mode<bi::normal_link>>
+{};
+
+struct Task;
+
+class SharedDriverData {
+  unsigned id;
+  spdk_nvme_transport_id trid;
+  spdk_nvme_ctrlr *ctrlr;
+  spdk_nvme_ns *ns;
+  uint32_t block_size = 0;
+  uint64_t size = 0;
+  std::thread admin_thread;
+
+  public:
+  std::vector<NVMEDevice*> registered_devices;
+  friend class SharedDriverQueueData;
+  SharedDriverData(unsigned id_, const spdk_nvme_transport_id& trid_,
+                   spdk_nvme_ctrlr *c, spdk_nvme_ns *ns_)
+      : id(id_),
+        trid(trid_),
+        ctrlr(c),
+        ns(ns_) {
+    block_size = spdk_nvme_ns_get_extended_sector_size(ns);
+    size = spdk_nvme_ns_get_size(ns);
+    if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+      return;
+    }
+
+    // For Non-PCIe transport, we need to send keep-alive periodically.
+    admin_thread = std::thread(
+      [this]() {
+	int rc;
+        while (true) {
+	  rc = spdk_nvme_ctrlr_process_admin_completions(ctrlr);
+          ceph_assert(rc >= 0);
+          sleep(1);
+        }
+      }
+    );
+  }
+
+  bool is_equal(const spdk_nvme_transport_id& trid2) const {
+    return spdk_nvme_transport_id_compare(&trid, &trid2) == 0;
+  }
+  ~SharedDriverData() {
+    if (admin_thread.joinable()) {
+      admin_thread.join();
+    }
+  }
+
+  void register_device(NVMEDevice *device) {
+    registered_devices.push_back(device);
+  }
+
+  void remove_device(NVMEDevice *device) {
+    std::vector<NVMEDevice*> new_devices;
+    for (auto &&it : registered_devices) {
+      if (it != device)
+        new_devices.push_back(it);
+    }
+    registered_devices.swap(new_devices);
+  }
+
+  uint32_t get_block_size() {
+    return block_size;
+  }
+  uint64_t get_size() {
+    return size;
+  }
+};
+
+class SharedDriverQueueData {
+  NVMEDevice *bdev;
+  SharedDriverData *driver;
+  spdk_nvme_ctrlr *ctrlr;
+  spdk_nvme_ns *ns;
+  std::string sn;
+  uint32_t block_size;
+  uint32_t max_queue_depth;
+  struct spdk_nvme_qpair *qpair;
+  int alloc_buf_from_pool(Task *t, bool write);
+
+  public:
+    uint32_t current_queue_depth = 0;
+    std::atomic_ulong completed_op_seq, queue_op_seq;
+    bi::slist<data_cache_buf, bi::constant_time_size<true>> data_buf_list;
+    void _aio_handle(Task *t, IOContext *ioc);
+
+    SharedDriverQueueData(NVMEDevice *bdev, SharedDriverData *driver)
+      : bdev(bdev),
+        driver(driver) {
+    ctrlr = driver->ctrlr;
+    ns = driver->ns;
+    block_size = driver->block_size;
+
+    struct spdk_nvme_io_qpair_opts opts = {};
+    spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
+    opts.qprio = SPDK_NVME_QPRIO_URGENT;
+    // usable queue depth should minus 1 to avoid overflow.
+    max_queue_depth = opts.io_queue_size - 1;
+    qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
+    ceph_assert(qpair != NULL);
+
+    // allocate spdk dma memory
+    for (uint16_t i = 0; i < data_buffer_default_num; i++) {
+      void *b = spdk_dma_zmalloc(data_buffer_size, CEPH_PAGE_SIZE, NULL);
+      if (!b) {
+        derr << __func__ << " failed to create memory pool for nvme data buffer" << dendl;
+        ceph_assert(b);
+      }
+      data_buf_list.push_front(*reinterpret_cast<data_cache_buf *>(b));
+    }
+  }
+
+  ~SharedDriverQueueData() {
+    if (qpair) {
+      spdk_nvme_ctrlr_free_io_qpair(qpair);
+    }
+
+    data_buf_list.clear_and_dispose(spdk_dma_free);
+  }
+};
+
+struct Task {
+  NVMEDevice *device;
+  IOContext *ctx = nullptr;
+  IOCommand command;
+  uint64_t offset;
+  uint64_t len;
+  bufferlist bl;
+  std::function<void()> fill_cb;
+  Task *next = nullptr;
+  int64_t return_code;
+  Task *primary = nullptr;
+  IORequest io_request = {};
+  SharedDriverQueueData *queue = nullptr;
+  // reference count by subtasks.
+  int ref = 0;
+  Task(NVMEDevice *dev, IOCommand c, uint64_t off, uint64_t l, int64_t rc = 0,
+       Task *p = nullptr)
+    : device(dev), command(c), offset(off), len(l),
+      return_code(rc), primary(p) {
+        if (primary) {
+          primary->ref++;
+          return_code = primary->return_code;
+        }
+     }
+  ~Task() {
+    if (primary)
+      primary->ref--;
+    ceph_assert(!io_request.nseg);
+  }
+  void release_segs(SharedDriverQueueData *queue_data) {
+    if (io_request.extra_segs) {
+      for (uint16_t i = 0; i < io_request.nseg; i++) {
+        auto buf = reinterpret_cast<data_cache_buf *>(io_request.extra_segs[i]);
+        queue_data->data_buf_list.push_front(*buf);
+      }
+      delete io_request.extra_segs;
+    } else if (io_request.nseg) {
+      for (uint16_t i = 0; i < io_request.nseg; i++) {
+        auto buf = reinterpret_cast<data_cache_buf *>(io_request.inline_segs[i]);
+        queue_data->data_buf_list.push_front(*buf);
+      }
+    }
+    ctx->total_nseg -= io_request.nseg;
+    io_request.nseg = 0;
+  }
+
+  void copy_to_buf(char *buf, uint64_t off, uint64_t len) {
+    uint64_t copied = 0;
+    uint64_t left = len;
+    void **segs = io_request.extra_segs ? io_request.extra_segs : io_request.inline_segs;
+    uint16_t i = 0;
+    while (left > 0) {
+      char *src = static_cast<char*>(segs[i++]);
+      uint64_t need_copy = std::min(left, data_buffer_size-off);
+      memcpy(buf+copied, src+off, need_copy);
+      off = 0;
+      left -= need_copy;
+      copied += need_copy;
+    }
+  }
+};
+
+static void data_buf_reset_sgl(void *cb_arg, uint32_t sgl_offset)
+{
+  Task *t = static_cast<Task*>(cb_arg);
+  uint32_t i = sgl_offset / data_buffer_size;
+  uint32_t offset = i * data_buffer_size;
+  ceph_assert(i <= t->io_request.nseg);
+
+  for (; i < t->io_request.nseg; i++) {
+    offset += data_buffer_size;
+    if (offset > sgl_offset) {
+      if (offset > t->len)
+        offset = t->len;
+      break;
+    }
+  }
+
+  t->io_request.cur_seg_idx = i;
+  t->io_request.cur_seg_left = offset - sgl_offset;
+  return ;
+}
+
+static int data_buf_next_sge(void *cb_arg, void **address, uint32_t *length)
+{
+  uint32_t size;
+  void *addr;
+  Task *t = static_cast<Task*>(cb_arg);
+  if (t->io_request.cur_seg_idx >= t->io_request.nseg) {
+    *length = 0;
+    *address = 0;
+    return 0;
+  }
+
+  addr = t->io_request.extra_segs ? t->io_request.extra_segs[t->io_request.cur_seg_idx] : t->io_request.inline_segs[t->io_request.cur_seg_idx];
+
+  size = data_buffer_size;
+  if (t->io_request.cur_seg_idx == t->io_request.nseg - 1) {
+      uint64_t tail = t->len % data_buffer_size;
+      if (tail) {
+        size = (uint32_t) tail;
+      }
+  }
+ 
+  if (t->io_request.cur_seg_left) {
+    *address = (void *)((uint64_t)addr + size - t->io_request.cur_seg_left);
+    *length = t->io_request.cur_seg_left;
+    t->io_request.cur_seg_left = 0;
+  } else {
+    *address = addr;
+    *length = size;
+  }
+  
+  t->io_request.cur_seg_idx++;
+  return 0;
+}
+
+int SharedDriverQueueData::alloc_buf_from_pool(Task *t, bool write)
+{
+  uint64_t count = t->len / data_buffer_size;
+  if (t->len % data_buffer_size)
+    ++count;
+  void **segs;
+  if (count > data_buf_list.size())
+    return -ENOMEM;
+  if (count <= inline_segment_num) {
+    segs = t->io_request.inline_segs;
+  } else {
+    t->io_request.extra_segs = new void*[count];
+    segs = t->io_request.extra_segs;
+  }
+  for (uint16_t i = 0; i < count; i++) {
+    ceph_assert(!data_buf_list.empty());
+    segs[i] = &data_buf_list.front();
+    ceph_assert(segs[i] != nullptr);
+    data_buf_list.pop_front();
+  }
+  t->io_request.nseg = count;
+  t->ctx->total_nseg += count;
+  if (write) {
+    auto blp = t->bl.begin();
+    uint32_t len = 0;
+    uint16_t i = 0;
+    for (; i < count - 1; ++i) {
+      blp.copy(data_buffer_size, static_cast<char*>(segs[i]));
+      len += data_buffer_size;
+    }
+    blp.copy(t->bl.length() - len, static_cast<char*>(segs[i]));
+  }
+
+  return 0;
+}
+
+void SharedDriverQueueData::_aio_handle(Task *t, IOContext *ioc)
+{
+  dout(20) << __func__ << " start" << dendl;
+
+  int r = 0;
+  uint64_t lba_off, lba_count;
+  uint32_t max_io_completion = (uint32_t)g_conf().get_val<uint64_t>("bluestore_spdk_max_io_completion");
+  uint64_t io_sleep_in_us = g_conf().get_val<uint64_t>("bluestore_spdk_io_sleep");
+
+  while (ioc->num_running) {
+ again:
+    dout(40) << __func__ << " polling" << dendl;
+    if (current_queue_depth) {
+      r = spdk_nvme_qpair_process_completions(qpair, max_io_completion);
+      if (r < 0) {
+        ceph_abort();
+      } else if (r == 0) {
+        usleep(io_sleep_in_us);
+      }
+    }
+
+    for (; t; t = t->next) {
+      if (current_queue_depth == max_queue_depth) {
+        // no slots
+        goto again;
+      }
+
+      t->queue = this;
+      lba_off = t->offset / block_size;
+      lba_count = t->len / block_size;
+      switch (t->command) {
+        case IOCommand::WRITE_COMMAND:
+        {
+          dout(20) << __func__ << " write command issued " << lba_off << "~" << lba_count << dendl;
+          r = alloc_buf_from_pool(t, true);
+          if (r < 0) {
+            goto again;
+          }
+
+          r = spdk_nvme_ns_cmd_writev(
+              ns, qpair, lba_off, lba_count, io_complete, t, 0,
+              data_buf_reset_sgl, data_buf_next_sge);
+          if (r < 0) {
+            derr << __func__ << " failed to do write command: " << cpp_strerror(r) << dendl;
+            t->ctx->nvme_task_first = t->ctx->nvme_task_last = nullptr;
+            t->release_segs(this);
+            delete t;
+            ceph_abort();
+          }
+          break;
+        }
+        case IOCommand::READ_COMMAND:
+        {
+          dout(20) << __func__ << " read command issued " << lba_off << "~" << lba_count << dendl;
+          r = alloc_buf_from_pool(t, false);
+          if (r < 0) {
+            goto again;
+          }
+
+          r = spdk_nvme_ns_cmd_readv(
+              ns, qpair, lba_off, lba_count, io_complete, t, 0,
+              data_buf_reset_sgl, data_buf_next_sge);
+          if (r < 0) {
+            derr << __func__ << " failed to read: " << cpp_strerror(r) << dendl;
+            t->release_segs(this);
+            delete t;
+            ceph_abort();
+          }
+          break;
+        }
+        case IOCommand::FLUSH_COMMAND:
+        {
+          dout(20) << __func__ << " flush command issueed " << dendl;
+          r = spdk_nvme_ns_cmd_flush(ns, qpair, io_complete, t);
+          if (r < 0) {
+            derr << __func__ << " failed to flush: " << cpp_strerror(r) << dendl;
+            t->release_segs(this);
+            delete t;
+            ceph_abort();
+          }
+          break;
+        }
+      }
+      current_queue_depth++;
+    }
+  }
+
+  dout(20) << __func__ << " end" << dendl;
+}
+
+#define dout_subsys ceph_subsys_bdev
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev "
+
+class NVMEManager {
+ public:
+  struct ProbeContext {
+    spdk_nvme_transport_id trid;
+    NVMEManager *manager;
+    SharedDriverData *driver;
+    bool done;
+  };
+
+ private:
+  ceph::mutex lock = ceph::make_mutex("NVMEManager::lock");
+  bool stopping = false;
+  std::vector<SharedDriverData*> shared_driver_datas;
+  std::thread dpdk_thread;
+  ceph::mutex probe_queue_lock = ceph::make_mutex("NVMEManager::probe_queue_lock");
+  ceph::condition_variable probe_queue_cond;
+  std::list<ProbeContext*> probe_queue;
+
+ public:
+  NVMEManager() {}
+  ~NVMEManager() {
+    if (!dpdk_thread.joinable())
+      return;
+    {
+      std::lock_guard guard(probe_queue_lock);
+      stopping = true;
+      probe_queue_cond.notify_all();
+    }
+    dpdk_thread.join();
+  }
+
+  int try_get(const spdk_nvme_transport_id& trid, SharedDriverData **driver);
+  void register_ctrlr(const spdk_nvme_transport_id& trid, spdk_nvme_ctrlr *c, SharedDriverData **driver) {
+    ceph_assert(ceph_mutex_is_locked(lock));
+    spdk_nvme_ns *ns;
+    int num_ns = spdk_nvme_ctrlr_get_num_ns(c);
+    ceph_assert(num_ns >= 1);
+    if (num_ns > 1) {
+      dout(0) << __func__ << " namespace count larger than 1, currently only use the first namespace" << dendl;
+    }
+    ns = spdk_nvme_ctrlr_get_ns(c, 1);
+    if (!ns) {
+      derr << __func__ << " failed to get namespace at 1" << dendl;
+      ceph_abort();
+    }
+    dout(1) << __func__ << " successfully attach nvme device at" << trid.traddr << dendl;
+
+    // only support one device per osd now!
+    ceph_assert(shared_driver_datas.empty());
+    // index 0 is occurred by master thread
+    shared_driver_datas.push_back(new SharedDriverData(shared_driver_datas.size()+1, trid, c, ns));
+    *driver = shared_driver_datas.back();
+  }
+};
+
+static NVMEManager manager;
+
+static bool probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, struct spdk_nvme_ctrlr_opts *opts)
+{
+  NVMEManager::ProbeContext *ctx = static_cast<NVMEManager::ProbeContext*>(cb_ctx);
+  bool do_attach = false;
+
+  if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
+    do_attach = spdk_nvme_transport_id_compare(&ctx->trid, trid) == 0;
+    if (!do_attach) {
+      dout(0) << __func__ << " device traddr (" << ctx->trid.traddr
+              << ") not match " << trid->traddr << dendl;
+    }
+  } else {
+    // for non-pcie devices, should always match the specified trid
+    assert(!spdk_nvme_transport_id_compare(&ctx->trid, trid));
+    do_attach = true;
+  }
+
+  if (do_attach) {
+    dout(0) << __func__ << " found device at: "
+	    << "trtype=" << spdk_nvme_transport_id_trtype_str(trid->trtype) << ", "
+	    << "traddr=" << trid->traddr << dendl;
+
+    opts->io_queue_size = UINT16_MAX;
+    opts->io_queue_requests = UINT16_MAX;
+    opts->keep_alive_timeout_ms = nvme_ctrlr_keep_alive_timeout_in_ms;
+  }
+
+  return do_attach;
+}
+
+static void attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+                      struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
+{
+  auto ctx = static_cast<NVMEManager::ProbeContext*>(cb_ctx);
+  ctx->manager->register_ctrlr(ctx->trid, ctrlr, &ctx->driver);
+}
+
+static int hex2dec(unsigned char c)
+{
+  if (isdigit(c))
+    return c - '0';
+  else if (isupper(c))
+    return c - 'A' + 10;
+  else
+    return c - 'a' + 10;
+}
+
+static int find_first_bitset(const string& s)
+{
+  auto e = s.rend();
+  if (s.compare(0, 2, "0x") == 0 ||
+      s.compare(0, 2, "0X") == 0) {
+    advance(e, -2);
+  }
+  auto p = s.rbegin();
+  for (int pos = 0; p != e; ++p, pos += 4) {
+    if (!isxdigit(*p)) {
+      return -EINVAL;
+    }
+    if (int val = hex2dec(*p); val != 0) {
+      return pos + ffs(val);
+    }
+  }
+  return 0;
+}
+
+int NVMEManager::try_get(const spdk_nvme_transport_id& trid, SharedDriverData **driver)
+{
+  std::lock_guard l(lock);
+  for (auto &&it : shared_driver_datas) {
+    if (it->is_equal(trid)) {
+      *driver = it;
+      return 0;
+    }
+  }
+
+  auto coremask_arg = g_conf().get_val<std::string>("bluestore_spdk_coremask");
+  int m_core_arg = find_first_bitset(coremask_arg);
+  // at least one core is needed for using spdk
+  if (m_core_arg <= 0) {
+    derr << __func__ << " invalid bluestore_spdk_coremask, "
+	 << "at least one core is needed" << dendl;
+    return -ENOENT;
+  }
+  m_core_arg -= 1;
+
+  uint32_t mem_size_arg = (uint32_t)g_conf().get_val<Option::size_t>("bluestore_spdk_mem");
+
+  if (!dpdk_thread.joinable()) {
+    dpdk_thread = std::thread(
+      [this, coremask_arg, m_core_arg, mem_size_arg, trid]() {
+        struct spdk_env_opts opts;
+        struct spdk_pci_addr addr;
+        int r;
+
+        bool local_pci_device = false;
+        int rc = spdk_pci_addr_parse(&addr, trid.traddr);
+        if (!rc) {
+          local_pci_device = true;
+          opts.pci_whitelist = &addr;
+          opts.num_pci_addr = 1;
+        }
+
+	spdk_env_opts_init(&opts);
+        opts.name = "nvme-device-manager";
+        opts.core_mask = coremask_arg.c_str();
+        opts.master_core = m_core_arg;
+        opts.mem_size = mem_size_arg;
+        spdk_env_init(&opts);
+        spdk_unaffinitize_thread();
+
+        std::unique_lock l(probe_queue_lock);
+        while (!stopping) {
+          if (!probe_queue.empty()) {
+            ProbeContext* ctxt = probe_queue.front();
+            probe_queue.pop_front();
+            r = spdk_nvme_probe(local_pci_device ? NULL : &trid, ctxt, probe_cb, attach_cb, NULL);
+            if (r < 0) {
+              ceph_assert(!ctxt->driver);
+              derr << __func__ << " device probe nvme failed" << dendl;
+            }
+            ctxt->done = true;
+            probe_queue_cond.notify_all();
+          } else {
+            probe_queue_cond.wait(l);
+          }
+        }
+        for (auto p : probe_queue)
+          p->done = true;
+        probe_queue_cond.notify_all();
+      }
+    );
+  }
+
+  ProbeContext ctx{trid, this, nullptr, false};
+  {
+    std::unique_lock l(probe_queue_lock);
+    probe_queue.push_back(&ctx);
+    while (!ctx.done)
+      probe_queue_cond.wait(l);
+  }
+  if (!ctx.driver)
+    return -1;
+  *driver = ctx.driver;
+
+  return 0;
+}
+
+void io_complete(void *t, const struct spdk_nvme_cpl *completion)
+{
+  Task *task = static_cast<Task*>(t);
+  IOContext *ctx = task->ctx;
+  SharedDriverQueueData *queue = task->queue;
+
+  ceph_assert(queue != NULL);
+  ceph_assert(ctx != NULL);
+  --queue->current_queue_depth;
+  if (task->command == IOCommand::WRITE_COMMAND) {
+    ceph_assert(!spdk_nvme_cpl_is_error(completion));
+    dout(20) << __func__ << " write/zero op successfully, left "
+             << queue->queue_op_seq - queue->completed_op_seq << dendl;
+    // check waiting count before doing callback (which may
+    // destroy this ioc).
+    if (ctx->priv) {
+      if (!--ctx->num_running) {
+        task->device->aio_callback(task->device->aio_callback_priv, ctx->priv);
+      }
+    } else {
+      ctx->try_aio_wake();
+    }
+    task->release_segs(queue);
+    delete task;
+  } else if (task->command == IOCommand::READ_COMMAND) {
+    ceph_assert(!spdk_nvme_cpl_is_error(completion));
+    dout(20) << __func__ << " read op successfully" << dendl;
+    task->fill_cb();
+    task->release_segs(queue);
+    // read submitted by AIO
+    if (!task->return_code) {
+      if (ctx->priv) {
+	if (!--ctx->num_running) {
+          task->device->aio_callback(task->device->aio_callback_priv, ctx->priv);
+	}
+      } else {
+        ctx->try_aio_wake();
+      }
+      delete task;
+    } else {
+      if (Task* primary = task->primary; primary != nullptr) {
+        delete task;
+        if (!primary->ref)
+          primary->return_code = 0;
+      } else {
+	  task->return_code = 0;
+      }
+      --ctx->num_running;
+    }
+  } else {
+    ceph_assert(task->command == IOCommand::FLUSH_COMMAND);
+    ceph_assert(!spdk_nvme_cpl_is_error(completion));
+    dout(20) << __func__ << " flush op successfully" << dendl;
+    task->return_code = 0;
+  }
+}
+
+// ----------------
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev(" << name << ") "
+
+NVMEDevice::NVMEDevice(CephContext* cct, aio_callback_t cb, void *cbpriv)
+  :   BlockDevice(cct, cb, cbpriv),
+      driver(nullptr)
+{
+}
+
+bool NVMEDevice::support(const std::string& path)
+{
+  char buf[PATH_MAX + 1];
+  int r = ::readlink(path.c_str(), buf, sizeof(buf) - 1);
+  if (r >= 0) {
+    buf[r] = '\0';
+    char *bname = ::basename(buf);
+    if (strncmp(bname, SPDK_PREFIX, sizeof(SPDK_PREFIX)-1) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+int NVMEDevice::open(const string& p)
+{
+  dout(1) << __func__ << " path " << p << dendl;
+
+  std::ifstream ifs(p);
+  if (!ifs) {
+    derr << __func__ << " unable to open " << p << dendl;
+    return -1;
+  }
+  string val;
+  std::getline(ifs, val);
+  spdk_nvme_transport_id trid;
+  if (int r = spdk_nvme_transport_id_parse(&trid, val.c_str()); r) {
+    derr << __func__ << " unable to read " << p << ": " << cpp_strerror(r)
+	 << dendl;
+    return r;
+  }
+  if (int r = manager.try_get(trid, &driver); r < 0) {
+    derr << __func__ << " failed to get nvme device with transport address "
+                      << trid.traddr << " type " << trid.trtype << dendl;
+    return r;
+  }
+
+  driver->register_device(this);
+  block_size = driver->get_block_size();
+  size = driver->get_size();
+  name = trid.traddr;
+
+  //nvme is non-rotational device.
+  rotational = false;
+
+  // round size down to an even block
+  size &= ~(block_size - 1);
+
+  dout(1) << __func__ << " size " << size << " (" << byte_u_t(size) << ")"
+          << " block_size " << block_size << " (" << byte_u_t(block_size)
+          << ")" << dendl;
+
+
+  return 0;
+}
+
+void NVMEDevice::close()
+{
+  dout(1) << __func__ << dendl;
+
+  name.clear();
+  driver->remove_device(this);
+
+  dout(1) << __func__ << " end" << dendl;
+}
+
+int NVMEDevice::collect_metadata(const string& prefix, map<string,string> *pm) const
+{
+  (*pm)[prefix + "rotational"] = "0";
+  (*pm)[prefix + "size"] = stringify(get_size());
+  (*pm)[prefix + "block_size"] = stringify(get_block_size());
+  (*pm)[prefix + "driver"] = "NVMEDevice";
+  (*pm)[prefix + "type"] = "nvme";
+  (*pm)[prefix + "access_mode"] = "spdk";
+  (*pm)[prefix + "nvme_serial_number"] = name;
+
+  return 0;
+}
+
+int NVMEDevice::flush()
+{
+  return 0;
+}
+
+void NVMEDevice::aio_submit(IOContext *ioc)
+{
+  dout(20) << __func__ << " ioc " << ioc << " pending "
+           << ioc->num_pending.load() << " running "
+           << ioc->num_running.load() << dendl;
+  int pending = ioc->num_pending.load();
+  Task *t = static_cast<Task*>(ioc->nvme_task_first);
+  if (pending && t) {
+    ioc->num_running += pending;
+    ioc->num_pending -= pending;
+    ceph_assert(ioc->num_pending.load() == 0);  // we should be only thread doing this
+    // Only need to push the first entry
+    ioc->nvme_task_first = ioc->nvme_task_last = nullptr;
+
+    thread_local SharedDriverQueueData queue_t = SharedDriverQueueData(this, driver);
+    queue_t._aio_handle(t, ioc);
+  }
+}
+
+static void ioc_append_task(IOContext *ioc, Task *t)
+{
+  Task *first, *last;
+
+  first = static_cast<Task*>(ioc->nvme_task_first);
+  last = static_cast<Task*>(ioc->nvme_task_last);
+  if (last)
+    last->next = t;
+  if (!first)
+    ioc->nvme_task_first = t;
+  ioc->nvme_task_last = t;
+  ++ioc->num_pending;
+}
+
+static void write_split(
+    NVMEDevice *dev,
+    uint64_t off,
+    bufferlist &bl,
+    IOContext *ioc)
+{
+  uint64_t remain_len = bl.length(), begin = 0, write_size;
+  Task *t;
+  // This value may need to be got from configuration later.
+  uint64_t split_size = 131072; // 128KB.
+
+  while (remain_len > 0) {
+    write_size = std::min(remain_len, split_size);
+    t = new Task(dev, IOCommand::WRITE_COMMAND, off + begin, write_size);
+    // TODO: if upper layer alloc memory with known physical address,
+    // we can reduce this copy
+    bl.splice(0, write_size, &t->bl);
+    remain_len -= write_size;
+    t->ctx = ioc;
+    ioc_append_task(ioc, t);
+    begin += write_size;
+  }
+}
+
+static void make_read_tasks(
+    NVMEDevice *dev,
+    uint64_t aligned_off,
+    IOContext *ioc, char *buf, uint64_t aligned_len, Task *primary,
+    uint64_t orig_off, uint64_t orig_len)
+{
+  // This value may need to be got from configuration later.
+  uint64_t split_size = 131072; // 128KB.
+  uint64_t tmp_off = orig_off - aligned_off, remain_orig_len = orig_len;
+  auto begin = aligned_off;
+  const auto aligned_end = begin + aligned_len;
+
+  for (; begin < aligned_end; begin += split_size) {
+    auto read_size = std::min(aligned_end - begin, split_size);
+    auto tmp_len = std::min(remain_orig_len, read_size - tmp_off);
+    Task *t = nullptr;
+
+    if (primary && (aligned_len <= split_size)) {
+      t = primary;
+    } else {
+      t = new Task(dev, IOCommand::READ_COMMAND, begin, read_size, 0, primary);
+    }
+
+    t->ctx = ioc;
+
+    // TODO: if upper layer alloc memory with known physical address,
+    // we can reduce this copy
+    t->fill_cb = [buf, t, tmp_off, tmp_len]  {
+      t->copy_to_buf(buf, tmp_off, tmp_len);
+    };
+
+    ioc_append_task(ioc, t);
+    remain_orig_len -= tmp_len;
+    buf += tmp_len;
+    tmp_off = 0;
+  }
+}
+
+int NVMEDevice::aio_write(
+    uint64_t off,
+    bufferlist &bl,
+    IOContext *ioc,
+    bool buffered,
+    int write_hint)
+{
+  uint64_t len = bl.length();
+  dout(20) << __func__ << " " << off << "~" << len << " ioc " << ioc
+           << " buffered " << buffered << dendl;
+  ceph_assert(is_valid_io(off, len));
+
+  write_split(this, off, bl, ioc);
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+
+  return 0;
+}
+
+int NVMEDevice::write(uint64_t off, bufferlist &bl, bool buffered, int write_hint)
+{
+  uint64_t len = bl.length();
+  dout(20) << __func__ << " " << off << "~" << len << " buffered "
+           << buffered << dendl;
+  ceph_assert(off % block_size == 0);
+  ceph_assert(len % block_size == 0);
+  ceph_assert(len > 0);
+  ceph_assert(off < size);
+  ceph_assert(off + len <= size);
+
+  IOContext ioc(cct, NULL);
+  write_split(this, off, bl, &ioc);
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  aio_submit(&ioc);
+  ioc.aio_wait();
+  return 0;
+}
+
+int NVMEDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
+                     IOContext *ioc,
+                     bool buffered)
+{
+  dout(5) << __func__ << " " << off << "~" << len << " ioc " << ioc << dendl;
+  ceph_assert(is_valid_io(off, len));
+
+  Task t(this, IOCommand::READ_COMMAND, off, len, 1);
+  bufferptr p = buffer::create_small_page_aligned(len);
+  char *buf = p.c_str();
+
+  // for sync read, need to control IOContext in itself
+  IOContext read_ioc(cct, nullptr);
+  make_read_tasks(this, off, &read_ioc, buf, len, &t, off, len);
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  aio_submit(&read_ioc);
+
+  pbl->push_back(std::move(p));
+  return t.return_code;
+}
+
+int NVMEDevice::aio_read(
+    uint64_t off,
+    uint64_t len,
+    bufferlist *pbl,
+    IOContext *ioc)
+{
+  dout(20) << __func__ << " " << off << "~" << len << " ioc " << ioc << dendl;
+  ceph_assert(is_valid_io(off, len));
+  bufferptr p = buffer::create_small_page_aligned(len);
+  pbl->append(p);
+  char* buf = p.c_str();
+
+  make_read_tasks(this, off, ioc, buf, len, NULL, off, len);
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  return 0;
+}
+
+int NVMEDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered)
+{
+  ceph_assert(len > 0);
+  ceph_assert(off < size);
+  ceph_assert(off + len <= size);
+
+  uint64_t aligned_off = p2align(off, block_size);
+  uint64_t aligned_len = p2roundup(off+len, block_size) - aligned_off;
+  dout(5) << __func__ << " " << off << "~" << len
+          << " aligned " << aligned_off << "~" << aligned_len << dendl;
+  IOContext ioc(g_ceph_context, nullptr);
+  Task t(this, IOCommand::READ_COMMAND, aligned_off, aligned_len, 1);
+
+  make_read_tasks(this, aligned_off, &ioc, buf, aligned_len, &t, off, len);
+  aio_submit(&ioc);
+
+  return t.return_code;
+}
+
+int NVMEDevice::invalidate_cache(uint64_t off, uint64_t len)
+{
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  return 0;
+}
diff --git a/src/blk/spdk/NVMEDevice.h b/src/blk/spdk/NVMEDevice.h
new file mode 100644
index 000000000..323d7281f
--- /dev/null
+++ b/src/blk/spdk/NVMEDevice.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+  *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BLK_NVMEDEVICE
+#define CEPH_BLK_NVMEDEVICE
+
+#include <queue>
+#include <map>
+#include <limits>
+
+// since _Static_assert introduced in c11
+#define _Static_assert static_assert
+
+
+#include "include/interval_set.h"
+#include "common/ceph_time.h"
+#include "BlockDevice.h"
+
+enum class IOCommand {
+  READ_COMMAND,
+  WRITE_COMMAND,
+  FLUSH_COMMAND
+};
+
+class SharedDriverData;
+class SharedDriverQueueData;
+
+class NVMEDevice : public BlockDevice {
+  /**
+   * points to pinned, physically contiguous memory region;
+   * contains 4KB IDENTIFY structure for controller which is
+   * target for CONTROLLER IDENTIFY command during initialization
+   */
+  SharedDriverData *driver;
+  std::string name;
+
+ public:
+  SharedDriverData *get_driver() { return driver; }
+
+  NVMEDevice(CephContext* cct, aio_callback_t cb, void *cbpriv);
+
+  bool supported_bdev_label() override { return false; }
+
+  static bool support(const std::string& path);
+
+  void aio_submit(IOContext *ioc) override;
+
+  int read(uint64_t off, uint64_t len, bufferlist *pbl,
+           IOContext *ioc,
+           bool buffered) override;
+  int aio_read(
+    uint64_t off,
+    uint64_t len,
+    bufferlist *pbl,
+    IOContext *ioc) override;
+  int aio_write(uint64_t off, bufferlist& bl,
+                IOContext *ioc,
+                bool buffered,
+		int write_hint = WRITE_LIFE_NOT_SET) override;
+  int write(uint64_t off, bufferlist& bl, bool buffered, int write_hint = WRITE_LIFE_NOT_SET) override;
+  int flush() override;
+  int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override;
+
+  // for managing buffered readers/writers
+  int invalidate_cache(uint64_t off, uint64_t len) override;
+  int open(const std::string& path) override;
+  void close() override;
+  int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const override;
+};
+
+#endif