Adding upstream version 14.2.21. (refs: upstream/14.2.21, upstream)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
commit: 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree: e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/os/filestore
parent: Initial commit. (diff)
download: ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
32 files changed, 19818 insertions, 0 deletions
diff --git a/src/os/filestore/BtrfsFileStoreBackend.cc b/src/os/filestore/BtrfsFileStoreBackend.cc
new file mode 100644
index 00000000..2ff2000d
--- /dev/null
+++ b/src/os/filestore/BtrfsFileStoreBackend.cc
@@ -0,0 +1,575 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/ceph_assert.h"
+
+#ifndef __CYGWIN__
+#include "os/fs/btrfs_ioctl.h"
+#endif
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "BtrfsFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+
+#if defined(__linux__)
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
+
+BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs):
+    GenericFileStoreBackend(fs), has_clone_range(false),
+    has_snap_create(false), has_snap_destroy(false),
+    has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
+    m_filestore_btrfs_clone_range(cct()->_conf->filestore_btrfs_clone_range),
+    m_filestore_btrfs_snap (cct()->_conf->filestore_btrfs_snap) { }
+
+int BtrfsFileStoreBackend::detect_features()
+{
+  int r;
+
+  r = GenericFileStoreBackend::detect_features();
+  if (r < 0)
+    return r;
+
+  // clone_range?
+  if (m_filestore_btrfs_clone_range) {
+    int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY|O_CLOEXEC, 0600);
+    if (fd >= 0) {
+      if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) {
+	r = -errno;
+	dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
+		<< cpp_strerror(r) << dendl;
+      }
+      btrfs_ioctl_clone_range_args clone_args;
+      memset(&clone_args, 0, sizeof(clone_args));
+      clone_args.src_fd = -1;
+      r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
+      if (r < 0 && errno == EBADF) {
+	dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl;
+	has_clone_range = true;
+      } else {
+	r = -errno;
+	dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
+      }
+      TEMP_FAILURE_RETRY(::close(fd));
+    } else {
+      r = -errno;
+      dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
+	      << cpp_strerror(r) << dendl;
+    }
+  } else {
+    dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
+  }
+
+  struct btrfs_ioctl_vol_args vol_args;
+  memset(&vol_args, 0, sizeof(vol_args));
+
+  // create test source volume
+  vol_args.fd = 0;
+  strcpy(vol_args.name, "test_subvol");
+  r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args);
+  if (r != 0) {
+    r = -errno;
+    dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+  }
+  int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY|O_CLOEXEC);
+  if (srcfd < 0) {
+    r = -errno;
+    dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+  }
+
+  // snap_create and snap_destroy?
+  vol_args.fd = srcfd;
+  strcpy(vol_args.name, "sync_snap_test");
+  r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+  int err = errno;
+  if (r == 0 || errno == EEXIST) {
+    dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl;
+    has_snap_create = true;
+
+    r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+    if (r == 0) {
+      dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl;
+      has_snap_destroy = true;
+    } else {
+      err = -errno;
+      dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+
+      if (err == -EPERM && getuid() != 0) {
+	dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
+	cerr << TEXT_YELLOW
+	     << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
+	     << TEXT_NORMAL << std::endl;
+      } else if (err == -EOPNOTSUPP) {
+	derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
+      }
+    }
+  } else {
+    dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
+  }
+
+  if (m_filestore_btrfs_snap) {
+    if (has_snap_destroy)
+      stable_commits = true;
+    else
+      dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
+  }
+
+  // start_sync?
+  __u64 transid = 0;
+  r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid);
+  if (r < 0) {
+    int err = errno;
+    dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl;
+  }
+  if (r == 0 && transid > 0) {
+    dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl;
+
+    // do we have wait_sync too?
+    r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+    if (r == 0 || errno == ERANGE) {
+      dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl;
+      has_wait_sync = true;
+    } else {
+      int err = errno;
+      dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+    }
+  } else {
+    int err = errno;
+    dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+  }
+
+  if (has_wait_sync) {
+    // async snap creation?
+    struct btrfs_ioctl_vol_args_v2 async_args;
+    memset(&async_args, 0, sizeof(async_args));
+    async_args.fd = srcfd;
+    async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+    strcpy(async_args.name, "async_snap_test");
+
+    // remove old one, first
+    struct stat st;
+    strcpy(vol_args.name, async_args.name);
+    if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) {
+      dout(0) << "detect_feature: removing old async_snap_test" << dendl;
+      r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+      if (r != 0) {
+	int err = errno;
+	dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
+      }
+    }
+
+    r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+    if (r == 0 || errno == EEXIST) {
+      dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl;
+      has_snap_create_v2 = true;
+
+      // clean up
+      strcpy(vol_args.name, "async_snap_test");
+      r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+      if (r != 0) {
+	int err = errno;
+	dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+      }
+    } else {
+      int err = errno;
+      dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl;
+    }
+  }
+
+  // clean up test subvol
+  if (srcfd >= 0)
+    TEMP_FAILURE_RETRY(::close(srcfd));
+
+  strcpy(vol_args.name, "test_subvol");
+  r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+  if (r < 0) {
+    r = -errno;
+    dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+  }
+
+  if (m_filestore_btrfs_snap && !has_snap_create_v2) {
+    dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
+    cerr << TEXT_YELLOW
+      << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
+      << "             rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
+      << "             (added in Linux 2.6.37).  Expect slow btrfs sync/commit\n"
+      << "             performance.\n"
+      << TEXT_NORMAL;
+  }
+
+  return 0;
+}
+
+bool BtrfsFileStoreBackend::can_checkpoint()
+{
+  return stable_commits;
+}
+
+int BtrfsFileStoreBackend::create_current()
+{
+  struct stat st;
+  int ret = ::stat(get_current_path().c_str(), &st);
+  if (ret == 0) {
+    // current/ already exists: validate it and detect whether it is a subvolume
+    if (!S_ISDIR(st.st_mode)) {
+      dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+      return -EINVAL;
+    }
+    // current/ counts as a btrfs subvolume when it is on btrfs and its st_dev differs from basedir's
+    struct stat basest;
+    struct statfs currentfs;
+    ret = ::fstat(get_basedir_fd(), &basest);
+    if (ret < 0) {
+      ret = -errno;
+      dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+    ret = ::statfs(get_current_path().c_str(), &currentfs);
+    if (ret < 0) {
+      ret = -errno;
+      dout(0) << "create_current: cannot statfs current " << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+    if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) {
+      dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl;
+      stable_commits = true;
+    }
+    return 0;
+  }
+  // stat() failed: create current/ as a new subvolume of basedir
+  struct btrfs_ioctl_vol_args volargs;
+  memset(&volargs, 0, sizeof(volargs));
+
+  volargs.fd = 0;
+  strcpy(volargs.name, "current");
+  if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) {
+    ret = -errno;
+    dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
+	    << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl;
+  if (::chmod(get_current_path().c_str(), 0755) < 0) {
+    ret = -errno;
+    dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
+	    << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  stable_commits = true;  // a freshly created subvolume can be snapshotted
+  return 0;
+}
+
+int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+  int ret, err = 0;
+
+  struct stat basest;
+  ret = ::fstat(get_basedir_fd(), &basest);
+  if (ret < 0) {
+    ret = -errno;
+    dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  // get snap list
+  DIR *dir = ::opendir(get_basedir_path().c_str());
+  if (!dir) {
+    ret = -errno;
+    dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
+	    << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  list<string> snaps;
+  char path[PATH_MAX];
+  struct dirent *de;
+  while ((de = ::readdir(dir))) {
+    snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name);
+
+    struct stat st;
+    ret = ::stat(path, &st);
+    if (ret < 0) {
+      err = -errno;
+      dout(0) << "list_checkpoints: stat '" << path << "' failed: "
+	      << cpp_strerror(err) << dendl;
+      break;
+    }
+
+    if (!S_ISDIR(st.st_mode))
+      continue;
+
+    struct statfs fs;
+    ret = ::statfs(path, &fs);
+    if (ret < 0) {
+      err = -errno;
+      dout(0) << "list_checkpoints: statfs '" << path << "' failed: "
+	      << cpp_strerror(err) << dendl;
+      break;
+    }
+
+    if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev)
+      snaps.push_back(string(de->d_name));
+  }
+
+  if (::closedir(dir) < 0) {
+      ret = -errno;
+      dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl;
+      if (!err)
+	err = ret;
+  }
+
+  if (err)
+    return err;
+
+  ls.swap(snaps);
+  return 0;
+}
+
+int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid)
+{
+  dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+  if (has_snap_create_v2 && transid) {
+    struct btrfs_ioctl_vol_args_v2 async_args;
+    memset(&async_args, 0, sizeof(async_args));
+    async_args.fd = get_current_fd();
+    async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+
+    size_t name_size = sizeof(async_args.name);
+    strncpy(async_args.name, name.c_str(), name_size);
+    async_args.name[name_size-1] = '\0';
+
+    int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+    if (r < 0) {
+      r = -errno;
+      dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl;
+    *transid = async_args.transid;
+  } else {
+    struct btrfs_ioctl_vol_args vol_args;
+    memset(&vol_args, 0, sizeof(vol_args));
+    vol_args.fd = get_current_fd();
+
+    size_t name_size = sizeof(vol_args.name);
+    strncpy(vol_args.name, name.c_str(), name_size);
+    vol_args.name[name_size-1] = '\0';
+
+    int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+    if (r < 0) {
+      r = -errno;
+      dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    if (transid)
+      *transid = 0;
+  }
+  return 0;
+}
+
+int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid)
+{
+  // Wait for commit of btrfs transaction `transid`; returns 0 or -errno.
+  dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl;
+  int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+  if (ret < 0) {
+    ret = -errno;  // capture now: the logging below may clobber errno
+    dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl;
+  return 0;
+}
+
+int BtrfsFileStoreBackend::rollback_to(const string& name)
+{
+  dout(10) << "rollback_to: to '" << name << "'" << dendl;
+  char s[PATH_MAX];
+  btrfs_ioctl_vol_args vol_args;
+  // Replace the "current" subvolume with a new snapshot of checkpoint `name`.
+  memset(&vol_args, 0, sizeof(vol_args));
+  vol_args.fd = 0;
+  strcpy(vol_args.name, "current");
+  // Convert a failure to -errno right away so the log below reports the real cause.
+  int ret = (::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args) != 0) ? -errno : 0;
+  if (ret && ret != -ENOENT) {
+    dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl;
+    snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
+    if (::rename(get_current_path().c_str(), s)) {
+      ret = -errno;
+      dout(0) << "rollback_to: error renaming old current subvol: "
+	      << cpp_strerror(ret) << dendl;
+      return ret;
+    }
+  }
+
+  snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str());
+
+  // roll back: snapshot the checkpoint (source fd) as "current" (vol_args.name is still set)
+  vol_args.fd = ::open(s, O_RDONLY|O_CLOEXEC);
+  if (vol_args.fd < 0) {
+    ret = -errno;
+    dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+  if (ret < 0 ) {
+    ret = -errno;
+    dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl;
+  }
+  TEMP_FAILURE_RETRY(::close(vol_args.fd));
+  return ret;
+}
+
+int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
+{
+  dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+  btrfs_ioctl_vol_args vol_args;
+  memset(&vol_args, 0, sizeof(vol_args));
+  vol_args.fd = 0;
+  strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name) - 1);  // zeroed last byte stays as the NUL
+  // Issue SNAP_DESTROY for `name` relative to basedir; returns 0 or -errno.
+  int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+  if (ret) {
+    ret = -errno;
+    dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+int BtrfsFileStoreBackend::syncfs()
+{
+  dout(15) << "syncfs" << dendl;
+  // do a full btrfs commit
+  int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC);
+  if (ret < 0) {
+    ret = -errno;
+    dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl;
+  }
+  return ret;
+}
+
+int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+  dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl;
+  size_t blk_size = get_blksize();
+  if (!has_clone_range ||
+      srcoff % blk_size != dstoff % blk_size) {
+    dout(20) << "clone_range: using copy" << dendl;
+    return _copy_range(from, to, srcoff, len, dstoff);
+  }
+
+  int err = 0;
+  int r = 0;
+
+  uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size);
+  uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size);
+  if (srcoffclone >= srcoff + len) {
+    dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl;
+    return _copy_range(from, to, srcoff, len, dstoff);
+  }
+
+  uint64_t lenclone = len - (srcoffclone - srcoff);
+  if (!ALIGNED(lenclone, blk_size)) {
+    struct stat from_stat, to_stat;
+    err = ::fstat(from, &from_stat);
+    if (err) return -errno;
+    err = ::fstat(to , &to_stat);
+    if (err) return -errno;
+
+    if (srcoff + len != (uint64_t)from_stat.st_size ||
+	dstoff + len < (uint64_t)to_stat.st_size) {
+      // Not to the end of the file, need to align length as well
+      lenclone = ALIGN_DOWN(lenclone, blk_size);
+    }
+  }
+  if (lenclone == 0) {
+    // too short
+    return _copy_range(from, to, srcoff, len, dstoff);
+  }
+
+  dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone
+	   << " to " << dstoffclone << " = " << r << dendl;
+  btrfs_ioctl_clone_range_args a;
+  a.src_fd = from;
+  a.src_offset = srcoffclone;
+  a.src_length = lenclone;
+  a.dest_offset = dstoffclone;
+  err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
+  if (err >= 0) {
+    r += err;
+  } else if (errno == EINVAL) {
+    // Still failed, might be compressed
+    dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
+    return _copy_range(from, to, srcoff, len, dstoff);
+  } else {
+    return -errno;
+  }
+
+  // Take care any trimmed from front
+  if (srcoffclone != srcoff) {
+    err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
+    if (err >= 0) {
+      r += err;
+    } else {
+      return err;
+    }
+  }
+
+  // Copy end
+  if (srcoffclone + lenclone != srcoff + len) {
+    err = _copy_range(from, to,
+			 srcoffclone + lenclone,
+			 (srcoff + len) - (srcoffclone + lenclone),
+			 dstoffclone + lenclone);
+    if (err >= 0) {
+      r += err;
+    } else {
+      return err;
+    }
+  }
+  dout(20) << "clone_range: finished " << srcoff << "~" << len
+	   << " to " << dstoff << " = " << r << dendl;
+  return r;
+}
+#endif
diff --git a/src/os/filestore/BtrfsFileStoreBackend.h b/src/os/filestore/BtrfsFileStoreBackend.h
new file mode 100644
index 00000000..0794be2d
--- /dev/null
+++ b/src/os/filestore/BtrfsFileStoreBackend.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BTRFSFILESTOREBACKEDN_H
+#define CEPH_BTRFSFILESTOREBACKEDN_H
+
+#if defined(__linux__)
+#include "GenericFileStoreBackend.h"
+
+class BtrfsFileStoreBackend : public GenericFileStoreBackend { // FileStore backend built on btrfs ioctls
+private:
+  bool has_clone_range;       ///< clone range ioctl is supported
+  bool has_snap_create;       ///< snap create ioctl is supported
+  bool has_snap_destroy;      ///< snap destroy ioctl is supported
+  bool has_snap_create_v2;    ///< snap create v2 ioctl (async!) is supported
+  bool has_wait_sync;         ///< wait sync ioctl is supported
+  bool stable_commits;        ///< value reported by can_checkpoint()
+  bool m_filestore_btrfs_clone_range;  ///< copy of the filestore_btrfs_clone_range config option
+  bool m_filestore_btrfs_snap;         ///< copy of the filestore_btrfs_snap config option
+public:
+  explicit BtrfsFileStoreBackend(FileStore *fs);
+  ~BtrfsFileStoreBackend() override {}
+  const char *get_name() override {
+    return "btrfs";
+  }
+  int detect_features() override;
+  bool can_checkpoint() override;
+  int create_current() override;
+  int list_checkpoints(list<string>& ls) override;
+  int create_checkpoint(const string& name, uint64_t *cid) override;
+  int sync_checkpoint(uint64_t cid) override;
+  int rollback_to(const string& name) override;
+  int destroy_checkpoint(const string& name) override;
+  int syncfs() override;
+  int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override;
+};
+#endif
+#endif
diff --git a/src/os/filestore/CollectionIndex.h b/src/os/filestore/CollectionIndex.h
new file mode 100644
index 00000000..eb43e47d
--- /dev/null
+++ b/src/os/filestore/CollectionIndex.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef OS_COLLECTIONINDEX_H
+#define OS_COLLECTIONINDEX_H
+
+#include <string>
+#include <vector>
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/RWLock.h"
+
+/**
+  CollectionIndex provides an interface for manipulating indexed collections
+ */
+class CollectionIndex {
+public:
+  CephContext* cct;
+protected:
+  /**
+   * Object encapsulating a returned path.
+   *
+   * A path to an object (existent or non-existent) becomes invalid
+   * when a different object is created in the index.  Path holds a
+   * raw, non-owning pointer to its parent CollectionIndex; paths built
+   * with the debugging constructor have no parent (nullptr).
+   * @see IndexManager
+   * @see self_ref
+   * @see set_ref
+   */
+  class Path {
+  public:
+    /// Returned path
+    string full_path;
+    /// Ref to parent Index
+    CollectionIndex* parent_ref;
+    /// coll_t for parent Index
+    coll_t parent_coll;
+
+    /// Normal Constructor
+    Path(
+      string path,                              ///< [in] Path to return.
+      CollectionIndex* ref)                     ///< [in] Parent index; dereferenced, so must be non-null.
+      : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {}
+
+    /// Debugging Constructor
+    Path(
+      string path,                              ///< [in] Path to return.
+      const coll_t& coll)                              ///< [in] collection
+      : full_path(path), parent_ref(nullptr), parent_coll(coll) {}
+
+    /// Getter for the stored path.
+    const char *path() const { return full_path.c_str(); }
+
+    /// Getter for collection
+    const coll_t& coll() const { return parent_coll; }
+
+    /// Getter for parent
+    CollectionIndex* get_index() const {
+      return parent_ref;
+    }
+  };
+ public:
+
+  RWLock access_lock;
+  /// Type of returned paths
+  typedef std::shared_ptr<Path> IndexedPath;
+
+  static IndexedPath get_testing_path(string path, coll_t collection) {
+    return std::make_shared<Path>(path, collection);
+  }
+
+  static const uint32_t FLAT_INDEX_TAG = 0;
+  static const uint32_t HASH_INDEX_TAG = 1;
+  static const uint32_t HASH_INDEX_TAG_2 = 2;
+  static const uint32_t HOBJECT_WITH_POOL = 3;
+  /**
+   * For tracking Filestore collection versions.
+   *
+   * @return Collection version represented by the Index implementation
+   */
+  virtual uint32_t collection_version() = 0;
+
+  /**
+   * Returns the collection managed by this CollectionIndex
+   */
+  virtual coll_t coll() const = 0;
+
+
+  /**
+   * Initializes the index.
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int init() = 0;
+
+  /**
+   * Cleanup before replaying journal
+   *
+   * Index implementations may need to perform compound operations
+   * which may leave the collection unstable if interrupted.  cleanup
+   * is called on mount to allow the CollectionIndex implementation
+   * to stabilize.
+   *
+   * @see HashIndex
+   * @return Error Code, 0 for success
+   */
+  virtual int cleanup() = 0;
+
+  /**
+   * Call when a file is created using a path returned from lookup.
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int created(
+    const ghobject_t &oid, ///< [in] Created object.
+    const char *path       ///< [in] Path to created object.
+    ) = 0;
+
+  /**
+   * Removes oid from the collection
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int unlink(
+    const ghobject_t &oid ///< [in] Object to remove
+    ) = 0;
+
+  /**
+   * Gets the IndexedPath for oid.
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int lookup(
+    const ghobject_t &oid, ///< [in] Object to lookup
+    IndexedPath *path,	   ///< [out] Path to object
+    int *hardlink          ///< [out] number of hard links of this object. *hardlink=0 mean object no-exist.
+    ) = 0;
+
+  /**
+   * Moves objects matching @e match in the lsb @e bits
+   *
+   * dest and this must be the same subclass
+   *
+   * @return Error Code, 0 for success
+   */
+  virtual int split(
+    uint32_t match,                             //< [in] value to match
+    uint32_t bits,                              //< [in] bits to check
+    CollectionIndex* dest  //< [in] destination index
+    ) { ceph_abort(); return 0; }
+
+  virtual int merge(
+    uint32_t bits,                              //< [in] common (target) bits
+    CollectionIndex* dest  //< [in] destination index
+    ) { ceph_abort(); return 0; }
+
+
+  /// List contents of collection by hash
+  virtual int collection_list_partial(
+    const ghobject_t &start, ///< [in] object at which to start
+    const ghobject_t &end,    ///< [in] list only objects < end
+    int max_count,          ///< [in] return at most max_count objects
+    vector<ghobject_t> *ls,  ///< [out] Listed objects
+    ghobject_t *next         ///< [out] Next object to list
+    ) = 0;
+
+  /// Call prior to removing directory
+  virtual int prep_delete() { return 0; }
+
+  CollectionIndex(CephContext* cct, const coll_t& collection)
+    : cct(cct), access_lock("CollectionIndex::access_lock", true, false) {}
+
+  /*
+   * Pre-hash the collection, this collection should map to a PG folder.
+   *
+   * @param pg_num            - pg number of the pool this collection belongs to.
+   * @param expected_num_objs - expected number of objects in this collection.
+   * @Return 0 on success, an error code otherwise.
+   */
+  virtual int pre_hash_collection(
+      uint32_t pg_num,            ///< [in] pg number of the pool this collection belongs to
+      uint64_t expected_num_objs  ///< [in] expected number of objects this collection has
+      ) { ceph_abort(); return 0; }
+
+  virtual int apply_layout_settings(int target_level) { ceph_abort(); return 0; }
+
+  /// Read index-wide settings (should be called after construction)
+  virtual int read_settings() { return 0; }
+
+  /// Virtual destructor
+  virtual ~CollectionIndex() {}
+};
+
+#endif
diff --git a/src/os/filestore/DBObjectMap.cc b/src/os/filestore/DBObjectMap.cc
new file mode 100644
index 00000000..5a057014
--- /dev/null
+++ b/src/os/filestore/DBObjectMap.cc
@@ -0,0 +1,1415 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+#include <iostream>
+#include <set>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "DBObjectMap.h"
+#include <errno.h>
+
+#include "common/debug.h"
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore "
+
+// Key-space building blocks.  Per-header prefixes are formed as
+// USER_PREFIX + header_key(seq) + <one of the prefixes below>.
+const string DBObjectMap::USER_PREFIX = "_USER_";
+const string DBObjectMap::XATTR_PREFIX = "_AXATTR_";
+const string DBObjectMap::SYS_PREFIX = "_SYS_";
+const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_";
+// Key of the encoded header stored under a header's sys prefix.
+const string DBObjectMap::HEADER_KEY = "HEADER";
+// Key of the user-supplied header blob stored under a header's sys prefix.
+const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER";
+// Key of the encoded global state stored directly under SYS_PREFIX.
+// NOTE(review): same literal as HEADER_KEY; the two are used under
+// different prefixes in this file.
+const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER";
+// Prefix under which ghobject_key(oid) maps to an encoded header.
+const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_";
+
+// Legacy
+const string DBObjectMap::LEAF_PREFIX = "_LEAF_";
+const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_";
+
+/// Append @p in to @p out, replacing each reserved character with a
+/// two-character escape: '%' -> "%p", '.' -> "%e", '_' -> "%u".
+/// Every other character is copied through unchanged.
+static void append_escaped(const string &in, string *out)
+{
+  for (const char c : in) {
+    switch (c) {
+    case '%':
+      out->append("%p");
+      break;
+    case '.':
+      out->append("%e");
+      break;
+    case '_':
+      out->append("%u");
+      break;
+    default:
+      out->push_back(c);
+      break;
+    }
+  }
+}
+
+/**
+ * Consistency check over every header reachable from HOBJECT_TO_SEQ.
+ *
+ * For each entry the header is decoded and its parent chain is followed.
+ * Along the way this:
+ *  - records the num_children stored in each header with a non-zero seq,
+ *  - counts how many children actually reference each parent,
+ *  - when state.v == 2 or @p force is set, verifies that the keys of the
+ *    header's "complete" table are strictly greater than the previous
+ *    entry's value (the value with its last byte dropped).
+ *
+ * @param out     receives a human-readable description of each problem
+ * @param repair  when true, a bad complete table is deleted instead of being
+ *                counted as an error, and a clean v2 store is moved to v3
+ * @param force   run the complete-table check regardless of state.v
+ * @return the number of errors found; -1 when no errors remain but at least
+ *         one repair was made
+ */
+int DBObjectMap::check(std::ostream &out, bool repair, bool force)
+{
+  int errors = 0, comp_errors = 0;
+  bool repaired = false;
+  // Children counted by walking the headers vs. the count each header stores.
+  map<uint64_t, uint64_t> parent_to_num_children;
+  map<uint64_t, uint64_t> parent_to_actual_num_children;
+  KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+  for (iter->seek_to_first(); iter->valid(); iter->next()) {
+    _Header header;
+    bufferlist bl = iter->value();
+    // Walk from this header up through its ancestors; bl is replaced with
+    // the parent's encoded header at the bottom of each pass.
+    while (true) {
+      auto bliter = bl.cbegin();
+      header.decode(bliter);
+      if (header.seq != 0)
+	parent_to_actual_num_children[header.seq] = header.num_children;
+
+      if (state.v == 2 || force) {
+	// Check complete table
+	bool complete_error = false;
+	boost::optional<string> prev;
+	KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+	for (complete_iter->seek_to_first(); complete_iter->valid();
+	     complete_iter->next()) {
+	  if (prev && prev >= complete_iter->key()) {
+	     out << "Bad complete for " << header.oid << std::endl;
+	     complete_error = true;
+	     break;
+	  }
+	  // NOTE(review): assumes the stored value is at least one byte long;
+	  // a zero-length value would make length() - 1 wrap -- confirm.
+	  prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1);
+	}
+	if (complete_error) {
+	  out << "Complete mapping for " << header.seq << " :" << std::endl;
+	  for (complete_iter->seek_to_first(); complete_iter->valid();
+	       complete_iter->next()) {
+	    out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl;
+	  }
+	  if (repair) {
+	    repaired = true;
+	    KeyValueDB::Transaction t = db->get_transaction();
+	    t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+	    // NOTE(review): the result of submit_transaction() is not checked.
+	    db->submit_transaction(t);
+	    out << "Cleared complete mapping to repair" << std::endl;
+	  } else {
+	    errors++;  // Only count when not repaired
+	    comp_errors++;  // Track errors here for version update
+	  }
+	}
+      }
+
+      // Reached the root of this chain.
+      if (header.parent == 0)
+	break;
+
+      if (!parent_to_num_children.count(header.parent))
+	parent_to_num_children[header.parent] = 0;
+      parent_to_num_children[header.parent]++;
+      // The parent (and therefore its ancestors) was already visited.
+      if (parent_to_actual_num_children.count(header.parent))
+	break;
+
+      set<string> to_get;
+      map<string, bufferlist> got;
+      to_get.insert(HEADER_KEY);
+      db->get(sys_parent_prefix(header), to_get, &got);
+      if (got.empty()) {
+	out << "Missing: seq " << header.parent << std::endl;
+	errors++;
+	break;
+      } else {
+	bl = got.begin()->second;
+      }
+    }
+  }
+
+  // Compare counted vs. recorded child counts; both maps are drained as the
+  // loop advances (the erase happens in the for-increment expression).
+  for (map<uint64_t, uint64_t>::iterator i = parent_to_num_children.begin();
+       i != parent_to_num_children.end();
+       parent_to_num_children.erase(i++)) {
+    if (!parent_to_actual_num_children.count(i->first))
+      continue;
+    if (parent_to_actual_num_children[i->first] != i->second) {
+      out << "Invalid: seq " << i->first << " recorded children: "
+	  << parent_to_actual_num_children[i->first] << " found: "
+	  << i->second << std::endl;
+      errors++;
+    }
+    parent_to_actual_num_children.erase(i->first);
+  }
+
+  // Only advance the version from 2 to 3 here
+  // Mark as legacy because there are still older structures
+  // we don't update.  The value of legacy is only used
+  // for internal assertions.
+  if (comp_errors == 0 && state.v == 2 && repair) {
+    state.v = 3;
+    state.legacy = true;
+    set_state();
+  }
+
+  // -1 signals "clean, but only because something was repaired".
+  if (errors == 0 && repaired)
+    return -1;
+  return errors;
+}
+
+/// Build the string key for @p oid.
+///
+/// Layout: escaped name, key and namespace (each followed by '.'), then the
+/// snap ("head", "snapdir" or hex), the pool (".none" when -1, else hex),
+/// the hash as at least 8 upper-case hex digits and, only when a generation
+/// or shard is set, ".<generation hex>.<shard hex>".
+string DBObjectMap::ghobject_key(const ghobject_t &oid)
+{
+  string key;
+  append_escaped(oid.hobj.oid.name, &key);
+  key.push_back('.');
+  append_escaped(oid.hobj.get_key(), &key);
+  key.push_back('.');
+  append_escaped(oid.hobj.nspace, &key);
+  key.push_back('.');
+
+  // The numeric suffix is not escaped; it is formatted into a scratch buffer
+  // and appended in one go.
+  char suffix[1000];
+  char *pos = suffix;
+  char *limit = pos + sizeof(suffix);
+
+  if (oid.hobj.snap == CEPH_NOSNAP) {
+    pos += snprintf(pos, limit - pos, "head");
+  } else if (oid.hobj.snap == CEPH_SNAPDIR) {
+    pos += snprintf(pos, limit - pos, "snapdir");
+  } else {
+    pos += snprintf(pos, limit - pos, "%llx", (long long unsigned)oid.hobj.snap);
+  }
+
+  if (oid.hobj.pool == -1) {
+    pos += snprintf(pos, limit - pos, ".none");
+  } else {
+    pos += snprintf(pos, limit - pos, ".%llx", (long long unsigned)oid.hobj.pool);
+  }
+
+  const int hash_digits = (int)(sizeof(uint32_t)*2);
+  pos += snprintf(pos, limit - pos, ".%.*X", hash_digits, oid.hobj.get_hash());
+
+  const bool has_gen_or_shard =
+    oid.generation != ghobject_t::NO_GEN ||
+    oid.shard_id != shard_id_t::NO_SHARD;
+  if (has_gen_or_shard) {
+    pos += snprintf(pos, limit - pos, ".%llx", (long long unsigned)oid.generation);
+    pos += snprintf(pos, limit - pos, ".%x", (int)oid.shard_id);
+  }
+
+  key += string(suffix);
+  return key;
+}
+
+//    ok: pglog%u3%efs1...0.none.0017B237
+//   bad: plana8923501-10...4c.3.ffffffffffffffff.2
+// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2
+// returns 0 for false, 1 for true, negative for error
+//
+// The key is scanned past its first five '.' separators.  The field found
+// there is then classified:
+//  - it runs to the end of the string: it must be the 8-character hash, so
+//    the key is fine (0), or -EINVAL if it is not 8 characters long;
+//  - it is followed by another '.': more fields follow and 1 is returned.
+// -EINVAL is also returned when the string ends before or right after the
+// fifth '.'.
+int DBObjectMap::is_buggy_ghobject_key_v1(CephContext* cct,
+                                          const string &in)
+{
+  int dots = 5;  // skip 5 .'s
+  const char *s = in.c_str();
+  do {
+    while (*s && *s != '.')
+      ++s;
+    if (!*s) {
+      derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
+      return -EINVAL;
+    }
+    ++s;
+  } while (*s && --dots);
+  if (!*s) {
+    derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
+    return -EINVAL;
+  }
+  // we are now either at a hash value (32 bits, 8 chars) or a generation
+  // value (64 bits) '.' and shard id.  count the dots!
+  int len = 0;
+  while (*s && *s != '.') {
+    ++s;
+    ++len;
+  }
+  if (*s == '\0') {
+    if (len != 8) {
+      derr << "hash value is not 8 chars" << dendl;
+      return -EINVAL;  // the hash value is always 8 chars.
+    }
+    return 0;
+  }
+  // The scan above stops only at '\0' or '.', and '\0' was handled, so we
+  // are at a '.' with further fields behind it.  (The former
+  // "missing final ." error branch could never be reached.)
+  return 1;
+}
+
+
+/// Key under which the header for @p oid is looked up in HOBJECT_TO_SEQ;
+/// simply forwards to ghobject_key().
+string DBObjectMap::map_header_key(const ghobject_t &oid)
+{
+  return ghobject_key(oid);
+}
+
+/// Render @p seq as decimal text, zero-padded on the left to at least
+/// 2*sizeof(seq) digits.
+string DBObjectMap::header_key(uint64_t seq)
+{
+  // The printf precision on an integer conversion is the minimum number of
+  // digits, which is what produces the leading zeros.
+  const int min_digits = (int)(2*sizeof(seq));
+  char text[100];
+  snprintf(text, sizeof(text), "%.*" PRId64, min_digits, seq);
+  return string(text);
+}
+
+/// Prefix of the "complete" table belonging to @p header:
+/// USER_PREFIX + header_key(seq) + COMPLETE_PREFIX.
+string DBObjectMap::complete_prefix(Header header)
+{
+  const string seq_key = header_key(header->seq);
+  return USER_PREFIX + seq_key + COMPLETE_PREFIX;
+}
+
+/// Prefix of the user key/value pairs belonging to @p header:
+/// USER_PREFIX + header_key(seq) + USER_PREFIX.
+string DBObjectMap::user_prefix(Header header)
+{
+  const string seq_key = header_key(header->seq);
+  return USER_PREFIX + seq_key + USER_PREFIX;
+}
+
+/// Prefix of the system keys belonging to @p header:
+/// USER_PREFIX + header_key(seq) + SYS_PREFIX.
+string DBObjectMap::sys_prefix(Header header)
+{
+  const string seq_key = header_key(header->seq);
+  return USER_PREFIX + seq_key + SYS_PREFIX;
+}
+
+/// Prefix of the xattrs belonging to @p header:
+/// USER_PREFIX + header_key(seq) + XATTR_PREFIX.
+string DBObjectMap::xattr_prefix(Header header)
+{
+  const string seq_key = header_key(header->seq);
+  return USER_PREFIX + seq_key + XATTR_PREFIX;
+}
+
+/// System-key prefix of the *parent* of @p header:
+/// USER_PREFIX + header_key(header.parent) + SYS_PREFIX.
+string DBObjectMap::sys_parent_prefix(_Header header)
+{
+  const string parent_key = header_key(header.parent);
+  return USER_PREFIX + parent_key + SYS_PREFIX;
+}
+
+/// Lazily set up the underlying iterators.
+///
+/// Always clears the invalid flag.  On the first call it also creates the
+/// parent iterator (when the header has a parent), the key iterator and the
+/// complete-table iterator, and points cur_iter at the key iterator; later
+/// calls return immediately.
+///
+/// @return 0; a failed parent lookup aborts.
+int DBObjectMap::DBObjectMapIteratorImpl::init()
+{
+  invalid = false;
+  if (ready)
+    return 0;
+
+  ceph_assert(!parent_iter);
+  if (header->parent) {
+    Header parent_header = map->lookup_parent(header);
+    if (!parent_header) {
+      ceph_abort();
+      return -EINVAL;
+    }
+    parent_iter =
+      std::make_shared<DBObjectMapIteratorImpl>(map, parent_header);
+  }
+
+  key_iter = map->db->get_iterator(map->user_prefix(header));
+  ceph_assert(key_iter);
+
+  complete_iter = map->db->get_iterator(map->complete_prefix(header));
+  ceph_assert(complete_iter);
+
+  cur_iter = key_iter;
+  ceph_assert(cur_iter);
+
+  ready = true;
+  return 0;
+}
+
+/// Return an iterator over the keys of @p oid.
+///
+/// When no header exists for the object an EmptyIteratorImpl is returned.
+/// Otherwise the MapHeaderLock taken here is swapped into the iterator's
+/// hlock member before the iterator is handed back.
+ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(
+  const ghobject_t &oid)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (header) {
+    DBObjectMapIterator iter = _get_iterator(header);
+    iter->hlock.swap(hl);
+    return iter;
+  }
+  return ObjectMapIterator(new EmptyIteratorImpl());
+}
+
+/// Rewind the parent iterator (if any) and the key iterator to their first
+/// entries, then let adjust() choose the current position.
+/// The last status is kept in the member r; a negative value is returned
+/// as soon as it occurs.
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first()
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->seek_to_first();
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  r = key_iter->seek_to_first();
+  if (r < 0) {
+    return r;
+  }
+  return adjust();
+}
+
+/// Move both underlying iterators one step past their last entry (seek to
+/// the last element, then next() if that element exists) and run adjust().
+/// The last status is kept in the member r; a negative value is returned
+/// as soon as it occurs.
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last()
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->seek_to_last();
+    if (r < 0) {
+      return r;
+    }
+    if (parent_iter->valid()) {
+      r = parent_iter->next();
+      if (r < 0) {
+        return r;
+      }
+    }
+  }
+
+  r = key_iter->seek_to_last();
+  if (r < 0) {
+    return r;
+  }
+  if (key_iter->valid()) {
+    r = key_iter->next();
+    if (r < 0) {
+      return r;
+    }
+  }
+  return adjust();
+}
+
+/// Position the parent iterator (if any) and the key iterator with
+/// lower_bound(@p to), then let adjust() choose the current position.
+/// The last status is kept in the member r.
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to)
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->lower_bound(to);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  r = key_iter->lower_bound(to);
+  if (r < 0) {
+    return r;
+  }
+  return adjust();
+}
+
+/// Like lower_bound(), but if the resulting position is valid and not on
+/// the parent, advance with next_parent().
+/// Uses a local status variable rather than the member r.
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound_parent(const string &to)
+{
+  int rc = lower_bound(to);
+  if (rc < 0)
+    return rc;
+  // Nothing more to do when we ran off the end or already sit on the parent.
+  if (!valid() || on_parent())
+    return rc;
+  return next_parent();
+}
+
+/// Position the parent iterator (if any) and the key iterator with
+/// upper_bound(@p after), then let adjust() choose the current position.
+/// The last status is kept in the member r.
+int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after)
+{
+  init();
+  r = 0;
+  if (parent_iter) {
+    r = parent_iter->upper_bound(after);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  r = key_iter->upper_bound(after);
+  if (r < 0) {
+    return r;
+  }
+  return adjust();
+}
+
+/// True when the iterator has been initialised and is not marked invalid.
+/// Asserts that cur_iter is itself valid whenever that is the case.
+bool DBObjectMap::DBObjectMapIteratorImpl::valid()
+{
+  const bool usable = ready && !invalid;
+  ceph_assert(!usable || cur_iter->valid());
+  return usable;
+}
+
+/// True when a parent iterator exists, is valid, and either the key
+/// iterator is exhausted or its key sorts after the parent's key.
+bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent()
+{
+  return parent_iter && parent_iter->valid() &&
+         (!key_iter->valid() || key_iter->key() > parent_iter->key());
+}
+
+/// Advance the current underlying iterator by one entry and re-run adjust()
+/// to pick the new current position.  The iterator must be valid on entry
+/// (asserted).
+int DBObjectMap::DBObjectMapIteratorImpl::next()
+{
+  ceph_assert(cur_iter->valid());
+  ceph_assert(valid());
+  cur_iter->next();
+  return adjust();
+}
+
+/// Advance, then keep seeking to the parent iterator's key until the
+/// current position is on the parent.  If there is no parent iterator, or
+/// it is exhausted, the iterator is marked invalid.
+/// @return a negative status from next()/lower_bound(), otherwise 0.
+int DBObjectMap::DBObjectMapIteratorImpl::next_parent()
+{
+  r = next();
+  if (r < 0) {
+    return r;
+  }
+
+  while (parent_iter && parent_iter->valid() && !on_parent()) {
+    ceph_assert(valid());
+    r = lower_bound(parent_iter->key());
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (!(parent_iter && parent_iter->valid())) {
+    invalid = true;
+  }
+  return 0;
+}
+
+/**
+ * Test whether @p to_test lies inside a region recorded in the complete
+ * table.
+ *
+ * The entry examined is the last one whose key is <= to_test.  Its value,
+ * minus the final byte, is the region end; an empty end matches every
+ * to_test at or after the entry's key.
+ *
+ * Moves complete_iter as a side effect.
+ *
+ * @param to_test key to classify
+ * @param begin   if non-null and a region matches, receives the region's key
+ * @param end     if non-null and a region matches, receives the region's end
+ * @return true (as int) when to_test is inside a region, false otherwise
+ */
+int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test,
+							     string *begin,
+							     string *end)
+{
+  /* This is clumsy because one cannot call prev() on end(), nor can one
+   * test for == begin().
+   */
+  complete_iter->upper_bound(to_test);
+  if (complete_iter->valid()) {
+    // Step back to the candidate entry; if that falls off the front there
+    // is no entry <= to_test, so restore the position and report "no".
+    complete_iter->prev();
+    if (!complete_iter->valid()) {
+      complete_iter->upper_bound(to_test);
+      return false;
+    }
+  } else {
+    // Nothing sorts after to_test: the candidate is the last entry, if any.
+    complete_iter->seek_to_last();
+    if (!complete_iter->valid())
+      return false;
+  }
+
+  ceph_assert(complete_iter->key() <= to_test);
+  ceph_assert(complete_iter->value().length() >= 1);
+  // Drop the final byte of the stored value to obtain the end key.
+  string _end(complete_iter->value().c_str(),
+	      complete_iter->value().length() - 1);
+  if (_end.empty() || _end > to_test) {
+    if (begin)
+      *begin = complete_iter->key();
+    if (end)
+      *end = _end;
+    return true;
+  } else {
+    complete_iter->next();
+    ceph_assert(!complete_iter->valid() || complete_iter->key() > to_test);
+    return false;
+  }
+}
+
+/**
+ * Moves parent_iter to the next position both out of the complete_region and
+ * not equal to key_iter.  Then, we set cur_iter to parent_iter if valid and
+ * less than key_iter and key_iter otherwise.
+ *
+ * If neither iterator can supply an entry the iterator is marked invalid.
+ * Always returns 0.
+ */
+int DBObjectMap::DBObjectMapIteratorImpl::adjust()
+{
+  string begin, end;
+  while (parent_iter && parent_iter->valid()) {
+    if (in_complete_region(parent_iter->key(), &begin, &end)) {
+      // Skip the whole region: an empty end means "to the end of the
+      // parent", otherwise resume at the region's end key.
+      if (end.size() == 0) {
+	parent_iter->seek_to_last();
+	if (parent_iter->valid())
+	  parent_iter->next();
+      } else
+	parent_iter->lower_bound(end);
+    } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) {
+      // Same key on both sides: step the parent past it.
+      parent_iter->next();
+    } else {
+      break;
+    }
+  }
+  if (valid_parent()) {
+    cur_iter = parent_iter;
+  } else if (key_iter->valid()) {
+    cur_iter = key_iter;
+  } else {
+    invalid = true;
+  }
+  ceph_assert(invalid || cur_iter->valid());
+  return 0;
+}
+
+
+/// Key at the current position; forwards to cur_iter.
+string DBObjectMap::DBObjectMapIteratorImpl::key()
+{
+  return cur_iter->key();
+}
+
+/// Value at the current position; forwards to cur_iter.
+bufferlist DBObjectMap::DBObjectMapIteratorImpl::value()
+{
+  return cur_iter->value();
+}
+
+/// Status code last stored in the member r by the seek/bound operations.
+int DBObjectMap::DBObjectMapIteratorImpl::status()
+{
+  return r;
+}
+
+/// Store the key/value pairs in @p set under the user prefix of @p oid,
+/// creating the object's header if it does not exist yet.
+///
+/// @return -EINVAL if no header could be obtained; 0 without writing when
+///         check_spos() returns true; otherwise the result of submitting
+///         the transaction.
+int DBObjectMap::set_keys(const ghobject_t &oid,
+                          const map<string, bufferlist> &set,
+                          const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_create_map_header(lock, oid, txn);
+  if (!hdr) {
+    return -EINVAL;
+  }
+  if (check_spos(oid, hdr, spos)) {
+    return 0;
+  }
+
+  txn->set(user_prefix(hdr), set);
+  return db->submit_transaction(txn);
+}
+
+/// Store @p bl as the user header of @p oid, creating the object's map
+/// header if it does not exist yet.
+///
+/// @return -EINVAL if no header could be obtained; 0 without writing when
+///         check_spos() returns true; otherwise the result of submitting
+///         the transaction.
+int DBObjectMap::set_header(const ghobject_t &oid,
+                            const bufferlist &bl,
+                            const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_create_map_header(lock, oid, txn);
+  if (!hdr) {
+    return -EINVAL;
+  }
+  if (check_spos(oid, hdr, spos)) {
+    return 0;
+  }
+
+  _set_header(hdr, bl, txn);
+  return db->submit_transaction(txn);
+}
+
+/// Queue in @p t a write of @p bl under USER_HEADER_KEY in the sys prefix
+/// of @p header.  Nothing is submitted here.
+void DBObjectMap::_set_header(Header header, const bufferlist &bl,
+                              KeyValueDB::Transaction t)
+{
+  map<string, bufferlist> pending;
+  pending[USER_HEADER_KEY] = bl;
+  t->set(sys_prefix(header), pending);
+}
+
+/// Fetch the user header of @p oid into @p bl.
+/// An object without a map header is not an error: 0 is returned and
+/// @p bl is left untouched.
+int DBObjectMap::get_header(const ghobject_t &oid,
+                            bufferlist *bl)
+{
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_map_header(lock, oid);
+  if (hdr) {
+    return _get_header(hdr, bl);
+  }
+  return 0;
+}
+
+/**
+ * Read the user header for @p header, falling back to ancestors.
+ *
+ * Starting at @p header, USER_HEADER_KEY is read from the sys prefix.  If it
+ * is absent the search moves to the parent, and so on until a value is found
+ * or a header without a parent is reached.
+ *
+ * @param header header to start from
+ * @param bl     receives the value when one is found; untouched otherwise
+ * @return a negative error from the store, otherwise 0 (also when nothing
+ *         was found)
+ */
+int DBObjectMap::_get_header(Header header,
+			     bufferlist *bl)
+{
+  map<string, bufferlist> out;
+  while (true) {
+    out.clear();
+    set<string> to_get;
+    to_get.insert(USER_HEADER_KEY);
+    int r = db->get(sys_prefix(header), to_get, &out);
+    if (r == 0 && !out.empty())
+      break;
+    if (r < 0)
+      return r;
+    // Not stored at this level: climb to the parent, if there is one.
+    Header current(header);
+    if (!current->parent)
+      break;
+    header = lookup_parent(current);
+  }
+
+  if (!out.empty())
+    bl->swap(out.begin()->second);
+  return 0;
+}
+
+/// Remove the map header of @p oid and release its reference on the
+/// underlying header via _clear().
+///
+/// @return -ENOENT if the object has no header; 0 without writing when
+///         check_spos() returns true; a negative error from _clear();
+///         otherwise the result of submitting the transaction.
+int DBObjectMap::clear(const ghobject_t &oid,
+                       const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_map_header(lock, oid);
+  if (!hdr) {
+    return -ENOENT;
+  }
+  if (check_spos(oid, hdr, spos)) {
+    return 0;
+  }
+
+  remove_map_header(lock, oid, hdr, txn);
+  ceph_assert(hdr->num_children > 0);
+  hdr->num_children--;
+
+  const int rc = _clear(hdr, txn);
+  if (rc < 0) {
+    return rc;
+  }
+  return db->submit_transaction(txn);
+}
+
+/**
+ * Release @p header after its num_children has been decremented by the
+ * caller, cascading up the parent chain.
+ *
+ * A header that still has children is simply rewritten with set_header().
+ * A header with no children left is removed with clear_header(); its
+ * parent then loses one child and is processed the same way.
+ *
+ * @return -EINVAL if a parent lookup yields nothing, otherwise 0
+ */
+int DBObjectMap::_clear(Header header,
+			KeyValueDB::Transaction t)
+{
+  while (1) {
+    if (header->num_children) {
+      set_header(header, t);
+      break;
+    }
+    clear_header(header, t);
+    if (!header->parent)
+      break;
+    Header parent = lookup_parent(header);
+    if (!parent) {
+      return -EINVAL;
+    }
+    ceph_assert(parent->num_children > 0);
+    parent->num_children--;
+    // Continue the loop with the parent as the header under consideration.
+    header.swap(parent);
+  }
+  return 0;
+}
+
+/// Read the user header visible from @p header (via _get_header()) and
+/// queue a write of it directly on @p header in @p t.
+/// @return a negative error from _get_header(), otherwise 0.
+int DBObjectMap::copy_up_header(Header header,
+                                KeyValueDB::Transaction t)
+{
+  bufferlist user_header;
+  const int rc = _get_header(header, &user_header);
+  if (rc < 0) {
+    return rc;
+  }
+  _set_header(header, user_header, t);
+  return 0;
+}
+
+/**
+ * Remove the keys in @p to_clear from the object map of @p oid.
+ *
+ * For a header without a parent this is a plain key removal.  For a header
+ * that still has a parent (legacy stores only, asserted), every remaining
+ * key and the user header are copied onto the header itself, the parent
+ * reference is dropped and the header's complete table is removed.
+ *
+ * @return -ENOENT if the object has no header; 0 without writing when
+ *         check_spos() returns true; a negative error from the iterator,
+ *         copy_up_header() or _clear(); -EINVAL if the parent lookup yields
+ *         nothing; otherwise the result of submitting the transaction.
+ */
+int DBObjectMap::rm_keys(const ghobject_t &oid,
+                         const set<string> &to_clear,
+                         const SequencerPosition *spos)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header)
+    return -ENOENT;
+  KeyValueDB::Transaction t = db->get_transaction();
+  if (check_spos(oid, header, spos))
+    return 0;
+  t->rmkeys(user_prefix(header), to_clear);
+  if (!header->parent) {
+    return db->submit_transaction(t);
+  }
+
+  ceph_assert(state.legacy);
+
+  {
+    // We only get here for legacy (v2) stores
+    // Copy up all keys from parent excluding to_clear
+    // and remove parent
+    // This eliminates a v2 format use of complete for this oid only
+    map<string, bufferlist> to_write;
+    ObjectMapIterator iter = _get_iterator(header);
+    for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+      if (iter->status())
+        return iter->status();
+      if (!to_clear.count(iter->key()))
+        to_write[iter->key()] = iter->value();
+    }
+    t->set(user_prefix(header), to_write);
+  } // destruct iter which has parent in_use
+
+  // Propagate failures instead of submitting a half-built transaction.
+  int r = copy_up_header(header, t);
+  if (r < 0)
+    return r;
+  Header parent = lookup_parent(header);
+  if (!parent)
+    return -EINVAL;
+  parent->num_children--;
+  r = _clear(parent, t);
+  if (r < 0)
+    return r;
+  header->parent = 0;
+  set_map_header(hl, oid, *header, t);
+  t->rmkeys_by_prefix(complete_prefix(header));
+  return db->submit_transaction(t);
+}
+
+/**
+ * Drop all keys and the header of @p oid while keeping its xattrs.
+ *
+ * The current xattrs are read out, the existing map header is removed and
+ * released via _clear(), and a fresh header is generated on which the saved
+ * xattrs are written back.
+ *
+ * @return -ENOENT if the object has no header; 0 without writing when
+ *         check_spos() returns true; -EINVAL if no xattr iterator could be
+ *         obtained; a non-zero iterator status; a negative error from
+ *         _clear(); otherwise the result of submitting the transaction.
+ */
+int DBObjectMap::clear_keys_header(const ghobject_t &oid,
+                                   const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock lock(this, oid);
+  Header old_header = lookup_map_header(lock, oid);
+  if (!old_header) {
+    return -ENOENT;
+  }
+  if (check_spos(oid, old_header, spos)) {
+    return 0;
+  }
+
+  // save old attrs
+  KeyValueDB::Iterator xattr_it = db->get_iterator(xattr_prefix(old_header));
+  if (!xattr_it) {
+    return -EINVAL;
+  }
+  map<string, bufferlist> saved_attrs;
+  xattr_it->seek_to_first();
+  while (!xattr_it->status() && xattr_it->valid()) {
+    saved_attrs.insert(make_pair(xattr_it->key(), xattr_it->value()));
+    xattr_it->next();
+  }
+  if (xattr_it->status()) {
+    return xattr_it->status();
+  }
+
+  // remove current header
+  remove_map_header(lock, oid, old_header, txn);
+  ceph_assert(old_header->num_children > 0);
+  old_header->num_children--;
+  const int rc = _clear(old_header, txn);
+  if (rc < 0) {
+    return rc;
+  }
+
+  // create new header
+  Header fresh_header = generate_new_header(oid, Header());
+  set_map_header(lock, oid, *fresh_header, txn);
+  if (!saved_attrs.empty()) {
+    txn->set(xattr_prefix(fresh_header), saved_attrs);
+  }
+  return db->submit_transaction(txn);
+}
+
+/**
+ * Read the user header and every key/value pair of @p oid.
+ *
+ * @param oid      object to read
+ * @param _header  receives the user header, if one exists
+ * @param out      receives all key/value pairs (inserted, not replaced)
+ * @return -ENOENT if the object has no header; a negative error from
+ *         reading the user header; a non-zero iterator status; otherwise 0
+ */
+int DBObjectMap::get(const ghobject_t &oid,
+                     bufferlist *_header,
+                     map<string, bufferlist> *out)
+{
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header)
+    return -ENOENT;
+  // Do not report success with a header the store failed to read.
+  int r = _get_header(header, _header);
+  if (r < 0)
+    return r;
+  ObjectMapIterator iter = _get_iterator(header);
+  for (iter->seek_to_first(); iter->valid(); iter->next()) {
+    if (iter->status())
+      return iter->status();
+    out->insert(make_pair(iter->key(), iter->value()));
+  }
+  return 0;
+}
+
+/// Insert every key of @p oid into @p keys.
+/// @return -ENOENT if the object has no header; a non-zero iterator status
+///         if one is seen while iterating; otherwise 0.
+int DBObjectMap::get_keys(const ghobject_t &oid,
+                          set<string> *keys)
+{
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_map_header(lock, oid);
+  if (!hdr) {
+    return -ENOENT;
+  }
+
+  ObjectMapIterator it = _get_iterator(hdr);
+  it->seek_to_first();
+  while (it->valid()) {
+    if (it->status()) {
+      return it->status();
+    }
+    keys->insert(it->key());
+    it->next();
+  }
+  return 0;
+}
+
+/**
+ * Look up each key of @p in_keys in the map described by @p header.
+ *
+ * @param header      header whose keys are searched
+ * @param in_keys     keys to look for
+ * @param out_keys    if non-null, receives the keys that exist
+ * @param out_values  if non-null, receives key/value pairs for the keys
+ *                    that exist
+ * @return a non-zero iterator status if a seek fails, otherwise 0
+ */
+int DBObjectMap::scan(Header header,
+                      const set<string> &in_keys,
+                      set<string> *out_keys,
+                      map<string, bufferlist> *out_values)
+{
+  ObjectMapIterator db_iter = _get_iterator(header);
+  for (const string &wanted : in_keys) {
+    db_iter->lower_bound(wanted);
+    if (db_iter->status()) {
+      return db_iter->status();
+    }
+    // lower_bound lands on the first key >= wanted; only an exact match
+    // counts as a hit.
+    if (!db_iter->valid() || !(db_iter->key() == wanted)) {
+      continue;
+    }
+    if (out_keys) {
+      out_keys->insert(wanted);
+    }
+    if (out_values) {
+      out_values->insert(make_pair(db_iter->key(), db_iter->value()));
+    }
+  }
+  return 0;
+}
+
+/// Fetch the values of those @p keys that exist for @p oid into @p out.
+/// @return -ENOENT if the object has no header, otherwise scan()'s result.
+int DBObjectMap::get_values(const ghobject_t &oid,
+                            const set<string> &keys,
+                            map<string, bufferlist> *out)
+{
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_map_header(lock, oid);
+  if (hdr) {
+    return scan(hdr, keys, 0, out);
+  }
+  return -ENOENT;
+}
+
+/// Report in @p out which of @p keys exist for @p oid.
+/// @return -ENOENT if the object has no header, otherwise scan()'s result.
+int DBObjectMap::check_keys(const ghobject_t &oid,
+                            const set<string> &keys,
+                            set<string> *out)
+{
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_map_header(lock, oid);
+  if (hdr) {
+    return scan(hdr, keys, out, 0);
+  }
+  return -ENOENT;
+}
+
+/// Read the xattrs named in @p to_get for @p oid into @p out.
+/// @return -ENOENT if the object has no header, otherwise the result of
+///         the store lookup under the object's xattr prefix.
+int DBObjectMap::get_xattrs(const ghobject_t &oid,
+                            const set<string> &to_get,
+                            map<string, bufferlist> *out)
+{
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_map_header(lock, oid);
+  if (hdr) {
+    return db->get(xattr_prefix(hdr), to_get, out);
+  }
+  return -ENOENT;
+}
+
+/// Insert the name of every xattr of @p oid into @p out.
+/// @return -ENOENT if the object has no header; -EINVAL if no iterator
+///         could be obtained; otherwise the iterator's final status.
+int DBObjectMap::get_all_xattrs(const ghobject_t &oid,
+                                set<string> *out)
+{
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_map_header(lock, oid);
+  if (!hdr) {
+    return -ENOENT;
+  }
+
+  KeyValueDB::Iterator it = db->get_iterator(xattr_prefix(hdr));
+  if (!it) {
+    return -EINVAL;
+  }
+  it->seek_to_first();
+  while (!it->status() && it->valid()) {
+    out->insert(it->key());
+    it->next();
+  }
+  return it->status();
+}
+
+/// Store the xattrs in @p to_set for @p oid, creating the object's header
+/// if it does not exist yet.
+///
+/// @return -EINVAL if no header could be obtained; 0 without writing when
+///         check_spos() returns true; otherwise the result of submitting
+///         the transaction.
+int DBObjectMap::set_xattrs(const ghobject_t &oid,
+                            const map<string, bufferlist> &to_set,
+                            const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_create_map_header(lock, oid, txn);
+  if (!hdr) {
+    return -EINVAL;
+  }
+  if (check_spos(oid, hdr, spos)) {
+    return 0;
+  }
+
+  txn->set(xattr_prefix(hdr), to_set);
+  return db->submit_transaction(txn);
+}
+
+/// Remove the xattrs named in @p to_remove from @p oid.
+///
+/// @return -ENOENT if the object has no header; 0 without writing when
+///         check_spos() returns true; otherwise the result of submitting
+///         the transaction.
+int DBObjectMap::remove_xattrs(const ghobject_t &oid,
+                               const set<string> &to_remove,
+                               const SequencerPosition *spos)
+{
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock lock(this, oid);
+  Header hdr = lookup_map_header(lock, oid);
+  if (!hdr) {
+    return -ENOENT;
+  }
+  if (check_spos(oid, hdr, spos)) {
+    return 0;
+  }
+
+  txn->rmkeys(xattr_prefix(hdr), to_remove);
+  return db->submit_transaction(txn);
+}
+
+// ONLY USED FOR TESTING
+// Marks the store as legacy (state.legacy = true) to avoid asserts.
+//
+// Clones oid to target using a shared parent: the existing header of oid
+// becomes the parent of two newly generated headers, one for oid and one
+// for target.  The parent's xattrs are copied onto both new headers and
+// removed from the parent.  Any existing header for target is removed
+// first.  If oid has no header, only that removal is submitted.
+int DBObjectMap::legacy_clone(const ghobject_t &oid,
+		       const ghobject_t &target,
+		       const SequencerPosition *spos)
+{
+  state.legacy = true;
+
+  if (oid == target)
+    return 0;
+
+  // The two locks are always constructed in min/max order of the object
+  // ids (presumably to keep a consistent lock order -- confirm);
+  // lsource/ltarget then name them by role.
+  MapHeaderLock _l1(this, std::min(oid, target));
+  MapHeaderLock _l2(this, std::max(oid, target));
+  MapHeaderLock *lsource, *ltarget;
+  if (oid > target) {
+    lsource = &_l2;
+    ltarget= &_l1;
+  } else {
+    lsource = &_l1;
+    ltarget= &_l2;
+  }
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  {
+    // Drop whatever the target currently maps to.
+    Header destination = lookup_map_header(*ltarget, target);
+    if (destination) {
+      if (check_spos(target, destination, spos))
+	return 0;
+      destination->num_children--;
+      remove_map_header(*ltarget, target, destination, t);
+      // NOTE(review): the result of _clear() is not checked here.
+      _clear(destination, t);
+    }
+  }
+
+  Header parent = lookup_map_header(*lsource, oid);
+  if (!parent)
+    return db->submit_transaction(t);
+
+  Header source = generate_new_header(oid, parent);
+  Header destination = generate_new_header(target, parent);
+  if (spos)
+    destination->spos = *spos;
+
+  // The old header is now referenced by exactly the two new headers.
+  parent->num_children = 2;
+  set_header(parent, t);
+  set_map_header(*lsource, oid, *source, t);
+  set_map_header(*ltarget, target, *destination, t);
+
+  // Move the xattrs from the parent onto both children.
+  map<string, bufferlist> to_set;
+  KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent));
+  for (xattr_iter->seek_to_first();
+       xattr_iter->valid();
+       xattr_iter->next())
+    to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+  t->set(xattr_prefix(source), to_set);
+  t->set(xattr_prefix(destination), to_set);
+  t->rmkeys_by_prefix(xattr_prefix(parent));
+  return db->submit_transaction(t);
+}
+
+/**
+ * Copy the object map of @p oid to @p target.
+ *
+ * Any existing header for the target is removed first.  A new, parentless
+ * header is then generated for the target and the source's user header,
+ * xattrs and key/value pairs are copied onto it.  Cloning an object onto
+ * itself is a no-op.  If the source has no header, only the removal of the
+ * old target is submitted.
+ *
+ * @return 0 without writing when oid == target or check_spos() returns
+ *         true; a negative error from _clear() or _get_header(); -EINVAL
+ *         if no xattr iterator could be obtained; a non-zero iterator
+ *         status; otherwise the result of submitting the transaction.
+ */
+int DBObjectMap::clone(const ghobject_t &oid,
+                       const ghobject_t &target,
+                       const SequencerPosition *spos)
+{
+  if (oid == target)
+    return 0;
+
+  // The two locks are always constructed in min/max order of the object
+  // ids (presumably to keep a consistent lock order -- confirm);
+  // lsource/ltarget then name them by role.
+  MapHeaderLock _l1(this, std::min(oid, target));
+  MapHeaderLock _l2(this, std::max(oid, target));
+  MapHeaderLock *lsource, *ltarget;
+  if (oid > target) {
+    lsource = &_l2;
+    ltarget= &_l1;
+  } else {
+    lsource = &_l1;
+    ltarget= &_l2;
+  }
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  {
+    // Drop whatever the target currently maps to.
+    Header destination = lookup_map_header(*ltarget, target);
+    if (destination) {
+      if (check_spos(target, destination, spos))
+        return 0;
+      destination->num_children--;
+      remove_map_header(*ltarget, target, destination, t);
+      // Same handling as clear(): give up rather than submit after a
+      // failed release.
+      int r = _clear(destination, t);
+      if (r < 0)
+        return r;
+    }
+  }
+
+  Header source = lookup_map_header(*lsource, oid);
+  if (!source)
+    return db->submit_transaction(t);
+
+  Header destination = generate_new_header(target, Header());
+  if (spos)
+    destination->spos = *spos;
+
+  set_map_header(*ltarget, target, *destination, t);
+
+  bufferlist bl;
+  int r = _get_header(source, &bl);
+  if (r < 0)
+    return r;
+  _set_header(destination, bl, t);
+
+  map<string, bufferlist> to_set;
+  KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(source));
+  // Same guard as clear_keys_header() and get_all_xattrs().
+  if (!xattr_iter)
+    return -EINVAL;
+  for (xattr_iter->seek_to_first();
+       xattr_iter->valid();
+       xattr_iter->next())
+    to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+  t->set(xattr_prefix(destination), to_set);
+
+  map<string, bufferlist> to_write;
+  ObjectMapIterator iter = _get_iterator(source);
+  for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+    if (iter->status())
+      return iter->status();
+    to_write[iter->key()] = iter->value();
+  }
+  t->set(user_prefix(destination), to_write);
+
+  return db->submit_transaction(t);
+}
+
+/**
+ * Rewrite HOBJECT_TO_SEQ keys that is_buggy_ghobject_key_v1() flags.
+ *
+ * Each flagged entry is re-keyed with ghobject_key() computed from the oid
+ * stored in its header.  Work is submitted in batches of up to 300 rewritten
+ * keys.  On success state.v is set to 2 and the state is persisted.
+ *
+ * @return a negative error for an unparsable key or a failed transaction,
+ *         otherwise 0
+ */
+int DBObjectMap::upgrade_to_v2()
+{
+  dout(1) << __func__ << " start" << dendl;
+  KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+  iter->seek_to_first();
+  while (iter->valid()) {
+    // One batch: collect up to 300 keys to rewrite, then submit.
+    unsigned count = 0;
+    KeyValueDB::Transaction t = db->get_transaction();
+    set<string> remove;
+    map<string, bufferlist> add;
+    for (;
+        iter->valid() && count < 300;
+        iter->next()) {
+      dout(20) << __func__ << " key is " << iter->key() << dendl;
+      int r = is_buggy_ghobject_key_v1(cct, iter->key());
+      if (r < 0) {
+	derr << __func__ << " bad key '" << iter->key() << "'" << dendl;
+	return r;
+      }
+      if (!r) {
+	dout(20) << __func__ << " " << iter->key() << " ok" << dendl;
+	continue;
+      }
+
+      // decode header to get oid
+      _Header hdr;
+      bufferlist bl = iter->value();
+      auto bliter = bl.cbegin();
+      hdr.decode(bliter);
+
+      string newkey(ghobject_key(hdr.oid));
+      dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl;
+      add[newkey] = iter->value();
+      remove.insert(iter->key());
+      ++count;
+    }
+
+    if (!remove.empty()) {
+      dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl;
+      t->rmkeys(HOBJECT_TO_SEQ, remove);
+      t->set(HOBJECT_TO_SEQ, add);
+      int r = db->submit_transaction(t);
+      if (r < 0)
+	return r;
+    }
+  }
+
+  state.v = 2;
+
+  set_state();
+  return 0;
+}
+
+/// Persist the in-memory state synchronously while holding header_lock.
+/// A failed submit trips an assertion.
+void DBObjectMap::set_state()
+{
+  Mutex::Locker guard(header_lock);
+  KeyValueDB::Transaction txn = db->get_transaction();
+  write_state(txn);
+  int ret = db->submit_transaction_sync(txn);
+  ceph_assert(ret == 0);
+  dout(1) << __func__ << " done" << dendl;
+}
+
+/// Load the global state from GLOBAL_STATE_KEY under SYS_PREFIX.
+/// If nothing is stored, the state is initialised as for a new store
+/// (current version, seq 1, not legacy).
+/// @return a negative error from the store, otherwise 0.
+int DBObjectMap::get_state()
+{
+  set<string> wanted;
+  wanted.insert(GLOBAL_STATE_KEY);
+  map<string, bufferlist> found;
+  int r = db->get(SYS_PREFIX, wanted, &found);
+  if (r < 0) {
+    return r;
+  }
+
+  if (found.empty()) {
+    // New store
+    state.v = State::CUR_VERSION;
+    state.seq = 1;
+    state.legacy = false;
+    return 0;
+  }
+
+  auto bliter = found.begin()->second.cbegin();
+  state.decode(bliter);
+  return 0;
+}
+
+/**
+ * Load the stored state, upgrade the on-disk format if needed, and run a
+ * repairing check().
+ *
+ * @param do_upgrade  whether a store older than version 2 may be upgraded
+ * @return a negative error from get_state() or upgrade_to_v2(); -ENOTSUP
+ *         if the store is older than version 1, or older than version 2
+ *         while @p do_upgrade is false; -EINVAL if check() reports errors;
+ *         otherwise 0
+ */
+int DBObjectMap::init(bool do_upgrade)
+{
+  int ret = get_state();
+  if (ret < 0)
+    return ret;
+  if (state.v < 1) {
+    dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
+            << dendl;
+    return -ENOTSUP;
+  }
+  if (state.v < 2) { // Needs upgrade
+    if (!do_upgrade) {
+      dout(1) << "DBObjectMap requires an upgrade,"
+              << " set filestore_update_to"
+              << dendl;
+      return -ENOTSUP;
+    } else {
+      int r = upgrade_to_v2();
+      if (r < 0)
+        return r;
+    }
+  }
+  ostringstream ss;
+  int errors = check(ss, true);
+  if (errors) {
+    // A negative result means problems were found but all were repaired:
+    // log the report and carry on.
+    derr << ss.str() << dendl;
+    if (errors > 0)
+      return -EINVAL;
+  }
+  dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl;
+  return 0;
+}
+
+/**
+ * Synchronously persist the global state and, when @p oid is given, record
+ * @p spos in that object's header within the same transaction.
+ *
+ * @param oid   object whose header should record @p spos, or null
+ * @param spos  sequencer position; must be non-null when @p oid is given
+ * @return the result of the synchronous transaction submit
+ */
+int DBObjectMap::sync(const ghobject_t *oid,
+		      const SequencerPosition *spos) {
+  KeyValueDB::Transaction t = db->get_transaction();
+  if (oid) {
+    ceph_assert(spos);
+    MapHeaderLock hl(this, *oid);
+    Header header = lookup_map_header(hl, *oid);
+    if (header) {
+      dout(10) << "oid: " << *oid << " setting spos to "
+	       << *spos << dendl;
+      header->spos = *spos;
+      set_map_header(hl, *oid, *header, t);
+    }
+    /* It may appear that this and the identical portion of the else
+     * block can be combined below, but in this block, the transaction
+     * must be submitted under *both* the MapHeaderLock and the full
+     * header_lock.
+     *
+     * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891.
+     */
+    Mutex::Locker l(header_lock);
+    write_state(t);
+    return db->submit_transaction_sync(t);
+  } else {
+    Mutex::Locker l(header_lock);
+    write_state(t);
+    return db->submit_transaction_sync(t);
+  }
+}
+
+/// Encode the global state and queue it under GLOBAL_STATE_KEY in
+/// SYS_PREFIX.  The caller must hold header_lock (asserted).
+///
+/// When @p _t is set the write is only queued in it and 0 is returned.
+/// When it is empty, a private transaction is created and submitted, and
+/// the submit result is returned.
+int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
+  ceph_assert(header_lock.is_locked_by_me());
+  dout(20) << "dbobjectmap: seq is " << state.seq << dendl;
+
+  KeyValueDB::Transaction txn = _t;
+  if (!txn) {
+    txn = db->get_transaction();
+  }
+
+  bufferlist encoded;
+  state.encode(encoded);
+  map<string, bufferlist> pending;
+  pending[GLOBAL_STATE_KEY] = encoded;
+  txn->set(SYS_PREFIX, pending);
+
+  if (_t) {
+    return 0;
+  }
+  return db->submit_transaction(txn);
+}
+
+
+/**
+ * Find the header mapped to @p oid, consulting the cache first and the
+ * store second.
+ *
+ * @param l    lock that must be held on exactly @p oid (asserted)
+ * @param oid  object to look up
+ * @return the header, registered in in_use and carrying a RemoveOnDelete
+ *         deleter; an empty Header if the store lookup fails or returns no
+ *         data
+ */
+DBObjectMap::Header DBObjectMap::_lookup_map_header(
+  const MapHeaderLock &l,
+  const ghobject_t &oid)
+{
+  ceph_assert(l.get_locked() == oid);
+
+  // Raw allocation: ownership passes to a Header on the success paths and
+  // is released with delete on the miss path below.
+  _Header *header = new _Header();
+  {
+    // Note: this local 'l' shadows the MapHeaderLock parameter.
+    Mutex::Locker l(cache_lock);
+    if (caches.lookup(oid, header)) {
+      ceph_assert(!in_use.count(header->seq));
+      in_use.insert(header->seq);
+      return Header(header, RemoveOnDelete(this));
+    }
+  }
+
+  bufferlist out;
+  int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out);
+  if (r < 0 || out.length()==0) {
+    delete header;
+    return Header();
+  }
+
+  Header ret(header, RemoveOnDelete(this));
+  auto iter = out.cbegin();
+  ret->decode(iter);
+  {
+    // Remember the decoded header for later lookups.
+    Mutex::Locker l(cache_lock);
+    caches.add(oid, *ret);
+  }
+
+  ceph_assert(!in_use.count(header->seq));
+  in_use.insert(header->seq);
+  return ret;
+}
+
+// New header for oid with the next seq; marks it in use and persists State.
+DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
+						      Header parent)
+{
+  Header fresh(new _Header(), RemoveOnDelete(this));
+  fresh->seq = state.seq++;
+  fresh->oid = oid;
+  fresh->num_children = 1;
+  if (parent) {  // child node: link to the parent's seq and copy its spos
+    fresh->parent = parent->seq;
+    fresh->spos = parent->spos;
+  }
+  ceph_assert(!in_use.count(fresh->seq));
+  in_use.insert(fresh->seq);
+  write_state();
+  return fresh;
+}
+
+// Wait until the parent's seq is free, load and decode the parent header from
+// the db, and return it marked in use.  Aborts if the parent cannot be read.
+DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
+{
+  Mutex::Locker guard(header_lock);
+  while (in_use.count(input->parent))
+    header_cond.Wait(header_lock);
+
+  dout(20) << "lookup_parent: parent " << input->parent
+       << " for seq " << input->seq << dendl;
+
+  set<string> wanted;
+  wanted.insert(HEADER_KEY);
+  map<string, bufferlist> found;
+  const int r = db->get(sys_parent_prefix(input), wanted, &found);
+  if (r < 0 || found.empty()) {
+    // an unreadable or missing parent header is treated as fatal
+    ceph_abort();
+    return Header();
+  }
+
+  Header parent_hdr(new _Header(), RemoveOnDelete(this));
+  auto p = found.begin()->second.cbegin();
+  parent_hdr->decode(p);
+  ceph_assert(parent_hdr->seq == input->parent);
+  dout(20) << "lookup_parent: parent seq is " << parent_hdr->seq
+       << " with parent " << parent_hdr->parent << dendl;
+  in_use.insert(parent_hdr->seq);
+  return parent_hdr;
+}
+
+// Return oid's leaf header, creating and staging a fresh one in t if absent.
+DBObjectMap::Header DBObjectMap::lookup_create_map_header(
+  const MapHeaderLock &hl,
+  const ghobject_t &oid,
+  KeyValueDB::Transaction t)
+{
+  Mutex::Locker guard(header_lock);
+  if (Header existing = _lookup_map_header(hl, oid))
+    return existing;
+  Header created = _generate_new_header(oid, Header());
+  set_map_header(hl, oid, *created, t);
+  return created;
+}
+
+// Stage removal of every key space that belongs to header's node.
+void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t)
+{
+  dout(20) << "clear_header: clearing seq " << header->seq << dendl;
+  t->rmkeys_by_prefix(user_prefix(header));
+  t->rmkeys_by_prefix(sys_prefix(header));
+  if (state.legacy) // complete keys are needed when header.parent != 0
+    t->rmkeys_by_prefix(complete_prefix(header));
+  t->rmkeys_by_prefix(xattr_prefix(header));
+  const set<string> node_key{header_key(header->seq)};
+  t->rmkeys(USER_PREFIX, node_key);
+}
+
+void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t)
+{ // Stage header, encoded, as HEADER_KEY under the node's sys prefix.
+  dout(20) << "set_header: setting seq " << header->seq << dendl;
+  bufferlist encoded;
+  header->encode(encoded);
+  t->set(sys_prefix(header), map<string, bufferlist>{{HEADER_KEY, encoded}});
+}
+
+// Stage deletion of oid's HOBJECT_TO_SEQ entry and drop oid from the cache.
+void DBObjectMap::remove_map_header(
+  const MapHeaderLock &l,
+  const ghobject_t &oid,
+  Header header,
+  KeyValueDB::Transaction t)
+{
+  ceph_assert(l.get_locked() == oid);
+  dout(20) << "remove_map_header: removing " << header->seq
+	   << " oid " << oid << dendl;
+  const set<string> doomed{map_header_key(oid)};
+  t->rmkeys(HOBJECT_TO_SEQ, doomed);
+
+  // nothing follows the cache update, so the guard can live until return
+  Mutex::Locker cache_guard(cache_lock);
+  caches.clear(oid);
+}
+
+// Stage header as oid's HOBJECT_TO_SEQ entry in t and add it to the cache.
+void DBObjectMap::set_map_header(
+  const MapHeaderLock &l,
+  const ghobject_t &oid, _Header header,
+  KeyValueDB::Transaction t)
+{
+  ceph_assert(l.get_locked() == oid);
+  dout(20) << "set_map_header: setting " << header.seq
+	   << " oid " << oid << " parent seq "
+	   << header.parent << dendl;
+  bufferlist encoded;
+  header.encode(encoded);
+  t->set(HOBJECT_TO_SEQ, map<string, bufferlist>{{map_header_key(oid), encoded}});
+
+  Mutex::Locker cache_guard(cache_lock); // held until return
+  caches.add(oid, header);
+}
+
+// True iff spos is given and <= header->spos, i.e. the caller should skip the op.
+bool DBObjectMap::check_spos(const ghobject_t &oid,
+			     Header header,
+			     const SequencerPosition *spos)
+{
+  if (!spos || *spos > header->spos) {
+    if (spos)
+      dout(10) << "oid: " << oid << " not skipping op, *spos "
+	       << *spos << dendl;
+    else
+      dout(10) << "oid: " << oid << " not skipping op, *spos "
+	       << "empty" << dendl;
+    dout(10) << " > header.spos " << header->spos << dendl;
+    return false;
+  } else {
+    dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
+	     << " <= header.spos " << header->spos << dendl;
+    return true;
+  }
+}
+
+int DBObjectMap::list_objects(vector<ghobject_t> *out)
+{ // Append the oid of every HOBJECT_TO_SEQ entry to *out; always returns 0.
+  KeyValueDB::Iterator it = db->get_iterator(HOBJECT_TO_SEQ);
+  for (it->seek_to_first(); it->valid(); it->next()) {
+    _Header decoded;
+    bufferlist raw = it->value();
+    auto p = raw.cbegin();
+    decoded.decode(p);
+    out->push_back(decoded.oid);
+  }
+  return 0;
+}
+
+// Append each leaf header followed by its ancestor chain to *out.  Returns
+// -ENOENT if any ancestor header is missing (the scan continues), else 0.
+int DBObjectMap::list_object_headers(vector<_Header> *out)
+{
+  int error = 0;
+  const set<string> wanted{HEADER_KEY};
+  KeyValueDB::Iterator it = db->get_iterator(HOBJECT_TO_SEQ);
+  for (it->seek_to_first(); it->valid(); it->next()) {
+    bufferlist raw = it->value();
+    auto leaf_p = raw.cbegin();
+    _Header hdr;
+    hdr.decode(leaf_p);
+    out->push_back(hdr);
+    while (hdr.parent) {
+      map<string, bufferlist> got;
+      db->get(sys_parent_prefix(hdr), wanted, &got);
+      if (got.empty()) {
+	dout(0) << "Missing: seq " << hdr.parent << dendl;
+	error = -ENOENT;
+	break;
+      }
+      raw = got.begin()->second;
+      auto parent_p = raw.cbegin();
+      hdr.decode(parent_p);
+      out->push_back(hdr);
+    }
+  }
+  return error;
+}
+
+ostream& operator<<(ostream& out, const DBObjectMap::_Header& h)
+{ // Prints "seq=.. parent=.. num_children=.. ghobject=.." with no newline.
+  out << "seq=" << h.seq;
+  out << " parent=" << h.parent;
+  out << " num_children=" << h.num_children;
+  return out << " ghobject=" << h.oid;
+}
+
+int DBObjectMap::rename(const ghobject_t &from,
+		       const ghobject_t &to,
+		       const SequencerPosition *spos)
+{ // Moves from's leaf header to 'to', first clearing any header already at 'to'.
+  if (from == to)
+    return 0;
+
+  MapHeaderLock _l1(this, std::min(from, to)); // smaller oid is always locked first
+  MapHeaderLock _l2(this, std::max(from, to));
+  MapHeaderLock *lsource, *ltarget; // map the ordered locks back to from / to
+  if (from > to) {
+    lsource = &_l2;
+    ltarget= &_l1;
+  } else {
+    lsource = &_l1;
+    ltarget= &_l2;
+  }
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  {
+    Header destination = lookup_map_header(*ltarget, to);
+    if (destination) {
+      if (check_spos(to, destination, spos))
+	return 0; // check_spos says skip: t is dropped without being submitted
+      destination->num_children--;
+      remove_map_header(*ltarget, to, destination, t);
+      _clear(destination, t);
+    }
+  }
+
+  Header hdr = lookup_map_header(*lsource, from);
+  if (!hdr)
+    return db->submit_transaction(t); // no source header: still submit what was staged
+
+  remove_map_header(*lsource, from, hdr, t);
+  hdr->oid = to;
+  set_map_header(*ltarget, to, *hdr, t);
+
+  return db->submit_transaction(t);
+}
diff --git a/src/os/filestore/DBObjectMap.h b/src/os/filestore/DBObjectMap.h
new file mode 100644
index 00000000..e288df83
--- /dev/null
+++ b/src/os/filestore/DBObjectMap.h
@@ -0,0 +1,585 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef DBOBJECTMAP_DB_H
+#define DBOBJECTMAP_DB_H
+
+#include "include/buffer_fwd.h"
+#include <set>
+#include <map>
+#include <string>
+
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "osd/osd_types.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/simple_cache.hpp"
+#include <boost/optional/optional_io.hpp>
+
+#include "SequencerPosition.h"
+
+/**
+ * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
+ *
+ * Prefix space structure:
+ *
+ * @see complete_prefix
+ * @see user_prefix
+ * @see sys_prefix
+ *
+ * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
+ *                   corresponding omap header
+ * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
+ *                                  @see State
+ *                                  @see write_state
+ *                                  @see init
+ *                                  @see generate_new_header
+ * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
+ *              : key->value for header->seq
+ * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
+ * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
+ * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
+ *              : USER_HEADER_KEY - omap header for header->seq
+ *              : HEADER_KEY - encoding of header for header->seq
+ *
+ * For each node (represented by a header), we
+ * store three mappings: the key mapping, the complete mapping, and the parent.
+ * The complete mapping (COMPLETE_PREFIX space) is key->key.  Each x->y entry in
+ * this mapping indicates that the key mapping contains all entries on [x,y).
+ * Note, max string is represented by "", so ""->"" indicates that the parent
+ * is unnecessary (@see rm_keys).  When looking up a key not contained in the
+ * complete set, we have to check the parent if we don't find it in the
+ * key set.  During rm_keys, we copy keys from the parent and update the
+ * complete set to reflect the change @see rm_keys.
+ */
+class DBObjectMap : public ObjectMap {
+public:
+
+  KeyValueDB *get_db() override { return db.get(); } // raw pointer obtained via db.get()
+
+  /**
+   * Serializes access to next_seq as well as the in_use set
+   */
+  Mutex header_lock;
+  Cond header_cond;
+  Cond map_header_cond;
+
+  /**
+   * Set of headers currently in use
+   */
+  set<uint64_t> in_use;
+  set<ghobject_t> map_header_in_use;
+
+  /**
+   * Takes the map_header_in_use entry in constructor, releases in
+   * destructor
+   */
+  class MapHeaderLock {
+    DBObjectMap *db;
+    boost::optional<ghobject_t> locked; // unset: this object holds no entry
+    // non-copyable: a copy would erase the same map_header_in_use entry twice
+    MapHeaderLock(const MapHeaderLock &) = delete;
+    MapHeaderLock &operator=(const MapHeaderLock &) = delete;
+  public:
+    explicit MapHeaderLock(DBObjectMap *db) : db(db) {} // holds nothing yet
+    MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
+      Mutex::Locker l(db->header_lock);
+      while (db->map_header_in_use.count(*locked)) // wait for the current holder
+	db->map_header_cond.Wait(db->header_lock);
+      db->map_header_in_use.insert(*locked);
+    }
+
+    const ghobject_t &get_locked() const { // asserts that an oid is held
+      ceph_assert(locked);
+      return *locked;
+    }
+
+    void swap(MapHeaderLock &o) { // exchange held oids; both must share db
+      ceph_assert(db == o.db);
+
+      // centos6's boost optional doesn't seem to have swap :(
+      boost::optional<ghobject_t> _locked = o.locked;
+      o.locked = locked;
+      locked = _locked;
+    }
+
+    ~MapHeaderLock() {
+      if (locked) {
+	Mutex::Locker l(db->header_lock);
+	ceph_assert(db->map_header_in_use.count(*locked));
+	db->map_header_cond.Signal();
+	db->map_header_in_use.erase(*locked);
+      }
+    }
+  };
+
+  DBObjectMap(CephContext* cct, KeyValueDB *db) // cct and db are forwarded to ObjectMap
+    : ObjectMap(cct, db), header_lock("DBOBjectMap"),
+      cache_lock("DBObjectMap::CacheLock"),
+      caches(cct->_conf->filestore_omap_header_cache_size) // header cache sized from config
+    {}
+
+  int set_keys(
+    const ghobject_t &oid,
+    const map<string, bufferlist> &set,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int set_header(
+    const ghobject_t &oid,
+    const bufferlist &bl,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int get_header(
+    const ghobject_t &oid,
+    bufferlist *bl
+    ) override;
+
+  int clear(
+    const ghobject_t &oid,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int clear_keys_header(
+    const ghobject_t &oid,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int rm_keys(
+    const ghobject_t &oid,
+    const set<string> &to_clear,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int get(
+    const ghobject_t &oid,
+    bufferlist *header,
+    map<string, bufferlist> *out
+    ) override;
+
+  int get_keys(
+    const ghobject_t &oid,
+    set<string> *keys
+    ) override;
+
+  int get_values(
+    const ghobject_t &oid,
+    const set<string> &keys,
+    map<string, bufferlist> *out
+    ) override;
+
+  int check_keys(
+    const ghobject_t &oid,
+    const set<string> &keys,
+    set<string> *out
+    ) override;
+
+  int get_xattrs(
+    const ghobject_t &oid,
+    const set<string> &to_get,
+    map<string, bufferlist> *out
+    ) override;
+
+  int get_all_xattrs(
+    const ghobject_t &oid,
+    set<string> *out
+    ) override;
+
+  int set_xattrs(
+    const ghobject_t &oid,
+    const map<string, bufferlist> &to_set,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int remove_xattrs(
+    const ghobject_t &oid,
+    const set<string> &to_remove,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int clone(
+    const ghobject_t &oid,
+    const ghobject_t &target,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int rename(
+    const ghobject_t &from,
+    const ghobject_t &to,
+    const SequencerPosition *spos=0
+    );
+
+  int legacy_clone(
+    const ghobject_t &oid,
+    const ghobject_t &target,
+    const SequencerPosition *spos=0
+    );
+
+  /// Read initial state from backing store
+  int get_state();
+  /// Write current state settings to DB
+  void set_state();
+  /// Read initial state and upgrade or initialize state
+  int init(bool upgrade = false);
+
+  /// Upgrade store to current version
+  int upgrade_to_v2();
+
+  /// Consistency check, debug, there must be no parallel writes
+  int check(std::ostream &out, bool repair = false, bool force = false) override;
+
+  /// Ensure that all previous operations are durable
+  int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override;
+
+  void compact() override { // forwards to the backing db's compact()
+    ceph_assert(db);
+    db->compact();
+  }
+
+  /// Util, get all objects, there must be no other concurrent access
+  int list_objects(vector<ghobject_t> *objs ///< [out] objects
+    );
+
+  struct _Header;
+  // Util, get all object headers, there must be no other concurrent access
+  int list_object_headers(vector<_Header> *out ///< [out] headers
+    );
+
+  ObjectMapIterator get_iterator(const ghobject_t &oid) override;
+
+  static const string USER_PREFIX;
+  static const string XATTR_PREFIX;
+  static const string SYS_PREFIX;
+  static const string COMPLETE_PREFIX;
+  static const string HEADER_KEY;
+  static const string USER_HEADER_KEY;
+  static const string GLOBAL_STATE_KEY;
+  static const string HOBJECT_TO_SEQ;
+
+  /// Legacy
+  static const string LEAF_PREFIX;
+  static const string REVERSE_LEAF_PREFIX;
+
+  /// Persistent global state of the store @see write_state @see _generate_new_header
+  struct State {
+    static const __u8 CUR_VERSION = 3;
+    __u8 v; // presumably the store format version (cf. CUR_VERSION) - confirm
+    uint64_t seq; // next header seq; handed out by _generate_new_header
+    // legacy is false when complete regions never used
+    bool legacy;
+    State() : v(0), seq(1), legacy(false) {}
+    explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {}
+
+    void encode(bufferlist &bl) const { // field order: v, seq, legacy
+      ENCODE_START(3, 1, bl);
+      encode(v, bl);
+      encode(seq, bl);
+      encode(legacy, bl);
+      ENCODE_FINISH(bl);
+    }
+
+    void decode(bufferlist::const_iterator &bl) { // defaults fields absent in older encodings
+      DECODE_START(3, bl);
+      if (struct_v >= 2)
+	decode(v, bl);
+      else
+	v = 0; // encodings before struct_v 2 carry no v
+      decode(seq, bl);
+      if (struct_v >= 3)
+	decode(legacy, bl);
+      else
+	legacy = false; // encodings before struct_v 3 carry no legacy flag
+      DECODE_FINISH(bl);
+    }
+
+    void dump(Formatter *f) const {
+      f->dump_unsigned("v", v);
+      f->dump_unsigned("seq", seq);
+      f->dump_bool("legacy", legacy);
+    }
+
+    static void generate_test_instances(list<State*> &o) { // caller owns the new'd objects
+      o.push_back(new State(0));
+      o.push_back(new State(20));
+    }
+  } state;
+
+  struct _Header { // one node of the header tree, stored encoded in the db
+    uint64_t seq; // this node's sequence number
+    uint64_t parent; // seq of the parent node; 0 means no parent
+    uint64_t num_children;
+
+    ghobject_t oid;
+
+    SequencerPosition spos; // compared against an op's position in check_spos
+
+    void encode(bufferlist &bl) const {
+      coll_t unused; // encoded only to keep a coll_t slot in the format
+      ENCODE_START(2, 1, bl);
+      encode(seq, bl);
+      encode(parent, bl);
+      encode(num_children, bl);
+      encode(unused, bl);
+      encode(oid, bl);
+      encode(spos, bl);
+      ENCODE_FINISH(bl);
+    }
+
+    void decode(bufferlist::const_iterator &bl) {
+      coll_t unused; // decoded and discarded
+      DECODE_START(2, bl);
+      decode(seq, bl);
+      decode(parent, bl);
+      decode(num_children, bl);
+      decode(unused, bl);
+      decode(oid, bl);
+      if (struct_v >= 2) // older encodings carry no spos; the member is left as it was
+	decode(spos, bl);
+      DECODE_FINISH(bl);
+    }
+
+    void dump(Formatter *f) const { // note: spos is not dumped
+      f->dump_unsigned("seq", seq);
+      f->dump_unsigned("parent", parent);
+      f->dump_unsigned("num_children", num_children);
+      f->dump_stream("oid") << oid;
+    }
+
+    static void generate_test_instances(list<_Header*> &o) { // caller owns the new'd objects
+      o.push_back(new _Header);
+      o.push_back(new _Header);
+      o.back()->parent = 20;
+      o.back()->seq = 30;
+    }
+
+    size_t length() { // in-memory size of the struct, not an encoded length
+      return sizeof(_Header);
+    }
+
+    _Header() : seq(0), parent(0), num_children(1) {}
+  };
+
+  /// String munging (public for testing)
+  static string ghobject_key(const ghobject_t &oid);
+  static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
+  static int is_buggy_ghobject_key_v1(CephContext* cct,
+				      const string &in);
+private:
+  /// Implicit lock on Header->seq
+  typedef std::shared_ptr<_Header> Header;
+  Mutex cache_lock;
+  SimpleLRU<ghobject_t, _Header> caches;
+
+  string map_header_key(const ghobject_t &oid);
+  string header_key(uint64_t seq);
+  string complete_prefix(Header header);
+  string user_prefix(Header header);
+  string sys_prefix(Header header);
+  string xattr_prefix(Header header);
+  string sys_parent_prefix(_Header header);
+  string sys_parent_prefix(Header header) { // shared-pointer convenience overload
+    return sys_parent_prefix(*header);
+  }
+
+  class EmptyIteratorImpl : public ObjectMapIteratorImpl { // iterator that is never valid
+  public:
+    int seek_to_first() override { return 0; }
+    int seek_to_last() { return 0; } // not marked override, unlike its siblings
+    int upper_bound(const string &after) override { return 0; }
+    int lower_bound(const string &to) override { return 0; }
+    bool valid() override { return false; } // never positioned on an entry
+    int next() override { ceph_abort(); return 0; } // aborts: valid() is always false
+    string key() override { ceph_abort(); return ""; }
+    bufferlist value() override { ceph_abort(); return bufferlist(); }
+    int status() override { return 0; }
+  };
+
+
+  /// Iterator over the keys of one header; uses parent_iter when a parent exists
+  class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
+  public:
+    DBObjectMap *map;
+
+    /// NOTE: implicit lock hlock.get_locked() when returned out of the class
+    MapHeaderLock hlock; // constructed via the single-argument ctor, i.e. holding nothing
+    /// NOTE: implicit lock on header->seq AND for all ancestors
+    Header header;
+
+    /// parent_iter == NULL iff no parent
+    std::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
+    KeyValueDB::Iterator key_iter;
+    KeyValueDB::Iterator complete_iter;
+
+    /// cur_iter points to currently valid iterator
+    std::shared_ptr<ObjectMapIteratorImpl> cur_iter;
+    int r;
+
+    /// init() called, key_iter, complete_iter, parent_iter filled in
+    bool ready;
+    /// past end
+    bool invalid;
+
+    DBObjectMapIteratorImpl(DBObjectMap *map, Header header) : // starts !ready and invalid
+      map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}
+    int seek_to_first() override;
+    int seek_to_last(); // not marked override, unlike its siblings
+    int upper_bound(const string &after) override;
+    int lower_bound(const string &to) override;
+    bool valid() override;
+    int next() override;
+    string key() override;
+    bufferlist value() override;
+    int status() override;
+
+    bool on_parent() { // true when cur_iter is the parent's iterator
+      return cur_iter == parent_iter;
+    }
+
+    /// skips to next valid parent entry
+    int next_parent();
+    
+    /// first parent() >= to
+    int lower_bound_parent(const string &to);
+
+    /**
+     * Tests whether to_test is in complete region
+     *
+     * postcondition: complete_iter will be max s.t. complete_iter->value > to_test
+     */
+    int in_complete_region(const string &to_test, ///< [in] key to test
+			   string *begin,         ///< [out] beginning of region
+			   string *end            ///< [out] end of region
+      ); ///< @returns true if to_test is in the complete region, else false (as an int)
+
+  private:
+    int init();
+    bool valid_parent();
+    int adjust();
+  };
+
+  typedef std::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator;
+  DBObjectMapIterator _get_iterator(Header header) { // new iterator bound to this map
+    return std::make_shared<DBObjectMapIteratorImpl>(this, header);
+  }
+
+  /// sys
+
+  /// Removes node corresponding to header
+  void clear_header(Header header, KeyValueDB::Transaction t);
+
+  /// Set node containing input to new contents
+  void set_header(Header input, KeyValueDB::Transaction t);
+
+  /// Remove leaf node corresponding to oid in c
+  void remove_map_header(
+    const MapHeaderLock &l,
+    const ghobject_t &oid,
+    Header header,
+    KeyValueDB::Transaction t);
+
+  /// Set leaf node for c and oid to the value of header
+  void set_map_header(
+    const MapHeaderLock &l,
+    const ghobject_t &oid, _Header header,
+    KeyValueDB::Transaction t);
+
+  /// True if spos is set and <= header->spos, i.e. the op should be skipped
+  bool check_spos(const ghobject_t &oid,
+		  Header header,
+		  const SequencerPosition *spos);
+
+  /// Lookup or create header for c oid
+  Header lookup_create_map_header(
+    const MapHeaderLock &l,
+    const ghobject_t &oid,
+    KeyValueDB::Transaction t);
+
+  /**
+   * Generate new header for c oid with new seq number
+   *
+   * Has the side effect of synchronously saving the new DBObjectMap state
+   */
+  Header _generate_new_header(const ghobject_t &oid, Header parent);
+  Header generate_new_header(const ghobject_t &oid, Header parent) { // locking wrapper
+    Mutex::Locker l(header_lock); // the _ variant's write_state() asserts this lock is held
+    return _generate_new_header(oid, parent);
+  }
+
+  /// Lookup leaf header for c oid
+  Header _lookup_map_header(
+    const MapHeaderLock &l,
+    const ghobject_t &oid);
+  Header lookup_map_header( // locking wrapper around _lookup_map_header
+    const MapHeaderLock &l2,
+    const ghobject_t &oid) {
+    Mutex::Locker l(header_lock); // in_use is updated by the _ variant under this lock
+    return _lookup_map_header(l2, oid);
+  }
+
+  /// Lookup header node for input
+  Header lookup_parent(Header input);
+
+
+  /// Helpers
+  int _get_header(Header header, bufferlist *bl);
+
+  /// Scan keys in header into out_keys and out_values (if nonnull)
+  int scan(Header header,
+	   const set<string> &in_keys,
+	   set<string> *out_keys,
+	   map<string, bufferlist> *out_values);
+
+  /// Remove header and all related prefixes
+  int _clear(Header header,
+	     KeyValueDB::Transaction t);
+
+  /* Scan complete region bumping *begin to the beginning of any
+   * containing region and adding all complete region keys between
+   * the updated begin and end to the complete_keys_to_remove set */
+  int merge_new_complete(DBObjectMapIterator &iter,
+			 string *begin,
+			 const string &end,
+			 set<string> *complete_keys_to_remove);
+
+  /// Writes out State (mainly next_seq)
+  int write_state(KeyValueDB::Transaction _t =
+		  KeyValueDB::Transaction());
+
+  /// Copies header entry from parent @see rm_keys
+  int copy_up_header(Header header,
+		     KeyValueDB::Transaction t);
+
+  /// Sets header @see set_header
+  void _set_header(Header header, const bufferlist &bl,
+		   KeyValueDB::Transaction t);
+
+  /**
+   * Removes header seq lock and possibly object lock
+   * once Header is out of scope
+   * @see lookup_parent
+   * @see generate_new_header
+   */
+  class RemoveOnDelete {
+  public:
+    DBObjectMap *db;
+    explicit RemoveOnDelete(DBObjectMap *db) : db(db) {}
+    /// Deleter: drop header->seq from in_use, signal header_cond, free header
+    void operator() (_Header *header) {
+      Mutex::Locker guard(db->header_lock);
+      ceph_assert(db->in_use.count(header->seq));
+      db->in_use.erase(header->seq);
+      db->header_cond.Signal();
+      delete header;
+    }
+  };
+  friend class RemoveOnDelete;
+};
+WRITE_CLASS_ENCODER(DBObjectMap::_Header)
+WRITE_CLASS_ENCODER(DBObjectMap::State)
+
+ostream& operator<<(ostream& out, const DBObjectMap::_Header& h);
+
+#endif
diff --git a/src/os/filestore/FDCache.h b/src/os/filestore/FDCache.h
new file mode 100644
index 00000000..ee8c4fb0
--- /dev/null
+++ b/src/os/filestore/FDCache.h
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_FDCACHE_H
+#define CEPH_FDCACHE_H
+
+#include <memory>
+#include <errno.h>
+#include <cstdio>
+#include "common/config_obs.h"
+#include "common/hobject.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/shared_cache.hpp"
+#include "include/compat.h"
+#include "include/intarith.h"
+
+/// FD Cache: open file descriptors cached per ghobject_t across LRU shards.
+class FDCache : public md_config_obs_t {
+public:
+  /// Wrapper for an fd.  Destructor closes the fd, so the type is
+  /// non-copyable: a copy would close the same descriptor a second time.
+  class FD {
+  public:
+    const int fd;
+    explicit FD(int _fd) : fd(_fd) {
+      ceph_assert(_fd >= 0);
+    }
+    FD(const FD&) = delete;
+    FD& operator=(const FD&) = delete;
+    int operator*() const { return fd; }
+    ~FD() { VOID_TEMP_FAILURE_RETRY(::close(fd)); }
+  };
+
+private:
+  CephContext *cct;
+  const int registry_shards;
+  SharedLRU<ghobject_t, FD> *registry;  ///< array of registry_shards caches
+
+  /// Shard count from config, at least 1; asserts cct before dereferencing it.
+  static int shard_count(CephContext *cct) {
+    ceph_assert(cct);
+    return std::max<int64_t>(cct->_conf->filestore_fd_cache_shards, 1);
+  }
+
+public:
+  explicit FDCache(CephContext *cct) : cct(cct),
+  registry_shards(shard_count(cct)) {
+    cct->_conf.add_observer(this);
+    registry = new SharedLRU<ghobject_t, FD>[registry_shards];
+    for (int i = 0; i < registry_shards; ++i) {
+      registry[i].set_cct(cct);
+      registry[i].set_size(
+          std::max<int64_t>((cct->_conf->filestore_fd_cache_size / registry_shards), 1));
+    }
+  }
+  // Non-copyable: owns 'registry' and is registered as an observer by address.
+  FDCache(const FDCache&) = delete;
+  FDCache& operator=(const FDCache&) = delete;
+  ~FDCache() override {
+    cct->_conf.remove_observer(this);
+    delete[] registry;
+  }
+  typedef std::shared_ptr<FD> FDRef;
+
+  /// look up the cached FDRef for hoid in its shard
+  FDRef lookup(const ghobject_t &hoid) {
+    int registry_id = hoid.hobj.get_hash() % registry_shards;
+    return registry[registry_id].lookup(hoid);
+  }
+  /// cache fd for hoid; the new FD wrapper closes fd when it is destroyed
+  FDRef add(const ghobject_t &hoid, int fd, bool *existed) {
+    int registry_id = hoid.hobj.get_hash() % registry_shards;
+    return registry[registry_id].add(hoid, new FD(fd), existed);
+  }
+  /// clear cached fd for hoid, subsequent lookups will get an empty FD
+  void clear(const ghobject_t &hoid) {
+    int registry_id = hoid.hobj.get_hash() % registry_shards;
+    registry[registry_id].purge(hoid);
+  }
+
+  /// md_config_obs_t
+  const char** get_tracked_conf_keys() const override {
+    static const char* KEYS[] = {
+      "filestore_fd_cache_size",
+      NULL
+    };
+    return KEYS;
+  }
+  void handle_conf_change(const ConfigProxy& conf,
+			  const std::set<std::string> &changed) override {
+    if (changed.count("filestore_fd_cache_size")) {
+      for (int i = 0; i < registry_shards; ++i)
+        registry[i].set_size(
+              std::max<int64_t>((conf->filestore_fd_cache_size / registry_shards), 1));
+    }
+  }
+};
+typedef FDCache::FDRef FDRef;
+
+#endif
diff --git a/src/os/filestore/FileJournal.cc b/src/os/filestore/FileJournal.cc
new file mode 100644
index 00000000..f0351fe4
--- /dev/null
+++ b/src/os/filestore/FileJournal.cc
@@ -0,0 +1,2216 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "acconfig.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "FileJournal.h"
+#include "include/color.h"
+#include "common/perf_counters.h"
+#include "FileStore.h"
+
+#include "include/compat.h"
+
+#include <fcntl.h>
+#include <limits.h>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+
+#include "common/blkdev.h"
+#if defined(__linux__)
+#include "common/linux_version.h"
+#endif
+
+#if defined(__FreeBSD__)
+#define O_DSYNC O_SYNC
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+const static int64_t ONE_MEG(1 << 20);  // 1 MiB; lower bound used by the journal size checks in _open_block_device()/_open_file()
+const static int CEPH_DIRECTIO_ALIGNMENT(4096);  // open() requires header.alignment to be a multiple of this when directio is on
+
+
+/**
+ * (Re)open the journal file or device named by fn.
+ *
+ * A descriptor still held in fd is closed first.  The freshly opened
+ * target is stat'ed: block devices go through _open_block_device(),
+ * regular files through _open_file() (aio is switched off for them unless
+ * force_aio is set), anything else is rejected.  With HAVE_LIBAIO and aio
+ * enabled an io context is created.  Last, max_size is rounded down to a
+ * multiple of block_size.
+ *
+ * @param forwrite open O_RDWR (plus O_DIRECT|O_DSYNC when directio is set)
+ *                 rather than O_RDONLY
+ * @param create   add O_CREAT to the open flags; forwarded to _open_file()
+ * @return 0 on success, a negative error code otherwise; on failure fd is
+ *         left at -1
+ */
+int FileJournal::_open(bool forwrite, bool create)
+{
+  int ret;
+  int flags = forwrite ? O_RDWR : O_RDONLY;
+  if (forwrite && directio)
+    flags |= O_DIRECT | O_DSYNC;
+  if (create)
+    flags |= O_CREAT;
+
+  // get rid of a descriptor left over from an earlier open; a failure to
+  // close it is only logged
+  if (fd >= 0 && TEMP_FAILURE_RETRY(::close(fd))) {
+    int err = errno;
+    derr << "FileJournal::_open: error closing old fd: "
+         << cpp_strerror(err) << dendl;
+  }
+
+  fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags|O_CLOEXEC, 0644));
+  if (fd < 0) {
+    int err = errno;
+    dout(2) << "FileJournal::_open unable to open journal "
+            << fn << ": " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  struct stat st;
+  if (::fstat(fd, &st)) {
+    ret = errno;
+    derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
+    ret = -ret;
+    goto out_fd;
+  }
+
+  // dispatch on what kind of thing the journal path points at
+  if (S_ISBLK(st.st_mode)) {
+    ret = _open_block_device();
+  } else if (!S_ISREG(st.st_mode)) {
+    derr << "FileJournal::_open: wrong journal file type: " << st.st_mode
+         << dendl;
+    ret = -EINVAL;
+  } else {
+    if (aio && !force_aio) {
+      derr << "FileJournal::_open: disabling aio for non-block journal.  Use "
+           << "journal_force_aio to force use of aio anyway" << dendl;
+      aio = false;
+    }
+    ret = _open_file(st.st_size, st.st_blksize, create);
+  }
+
+  if (ret)
+    goto out_fd;
+
+#ifdef HAVE_LIBAIO
+  if (aio) {
+    aio_ctx = 0;
+    ret = io_setup(128, &aio_ctx);
+    if (ret < 0) {
+      if (ret == -EAGAIN) {
+        // Contrary to naive expectations -EAGAIN means ...
+        derr << "FileJournal::_open: user's limit of aio events exceeded. "
+             << "Try increasing /proc/sys/fs/aio-max-nr" << dendl;
+      } else {
+        derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl;
+      }
+      goto out_fd;
+    }
+  }
+#endif
+
+  /* We really want max_size to be a multiple of block_size. */
+  max_size -= max_size % block_size;
+
+  dout(1) << "_open " << fn << " fd " << fd
+          << ": " << max_size
+          << " bytes, block size " << block_size
+          << " bytes, directio = " << directio
+          << ", aio = " << aio
+          << dendl;
+  return 0;
+
+ out_fd:
+  // common failure exit: do not leave a half-initialised descriptor behind
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  fd = -1;
+  return ret;
+}
+
+/**
+ * Size the journal from the block device behind fd.
+ *
+ * The whole device is used (max_size = device size); block_size comes
+ * from journal_block_size, and discard support is probed when
+ * journal_discard is enabled.
+ *
+ * @return 0 on success, -EIO if the device size cannot be read, -EINVAL
+ *         if the device is smaller than ONE_MEG
+ */
+int FileJournal::_open_block_device()
+{
+  BlkDev blkdev(fd);
+  int64_t bdev_sz = 0;
+  if (blkdev.get_size(&bdev_sz)) {
+    dout(0) << __func__ << ": failed to read block device size." << dendl;
+    return -EIO;
+  }
+
+  /* Check for bdev_sz too small */
+  if (bdev_sz < ONE_MEG) {
+    dout(0) << __func__ << ": your block device must be at least "
+            << ONE_MEG << " bytes to be used for a Ceph journal." << dendl;
+    return -EINVAL;
+  }
+
+  dout(10) << __func__ << ": ignoring osd journal size. "
+           << "We'll use the entire block device (size: " << bdev_sz << ")"
+           << dendl;
+
+  max_size = bdev_sz;
+  block_size = cct->_conf->journal_block_size;
+
+  if (cct->_conf->journal_discard) {
+    discard = blkdev.support_discard();
+    dout(10) << fn << " support discard: " << (int)discard << dendl;
+  }
+
+  return 0;
+}
+
+/**
+ * Size (and optionally grow / zero) a journal that lives in a regular file.
+ *
+ * osd_journal_size is shifted left by 20 before use.  When creating
+ * and the file is smaller than that, it is extended with ftruncate() and
+ * preallocated; otherwise the existing size is kept.  With
+ * journal_zero_on_create the whole journal is overwritten with zeroes in
+ * 1 MiB chunks.
+ *
+ * @param oldsize current size of the file in bytes
+ * @param blksize not used by this function
+ * @param create  whether the journal is being created (allows extending
+ *                and zeroing)
+ * @return 0 on success, negative errno on failure
+ */
+int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
+			    bool create)
+{
+  int ret;
+  int64_t conf_journal_sz(cct->_conf->osd_journal_size);
+  conf_journal_sz <<= 20;
+
+  if ((cct->_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) {
+    derr << "I'm sorry, I don't know how large of a journal to create. "
+	 << "Please specify a block device to use as the journal OR "
+	 << "set osd_journal_size in your ceph.conf" << dendl;
+    return -EINVAL;
+  }
+
+  if (create && (oldsize < conf_journal_sz)) {
+    uint64_t newsize(conf_journal_sz);
+    dout(10) <<  __func__ << " _open extending to " << newsize << " bytes" << dendl;
+    ret = ::ftruncate(fd, newsize);
+    if (ret < 0) {
+      int err = errno;
+      derr << "FileJournal::_open_file : unable to extend journal to "
+	   << newsize << " bytes: " << cpp_strerror(err) << dendl;
+      return -err;
+    }
+    // ceph_posix_fallocate() reports failure as a positive errno value
+    ret = ceph_posix_fallocate(fd, 0, newsize);
+    if (ret) {
+      derr << "FileJournal::_open_file : unable to preallocation journal to "
+	   << newsize << " bytes: " << cpp_strerror(ret) << dendl;
+      return -ret;
+    }
+    max_size = newsize;
+  }
+  else {
+    max_size = oldsize;
+  }
+  block_size = cct->_conf->journal_block_size;
+
+  if (create && cct->_conf->journal_zero_on_create) {
+    derr << "FileJournal::_open_file : zeroing journal" << dendl;
+    uint64_t write_size = 1 << 20;
+    char *buf;
+    // block_size-aligned buffer for the zeroing writes
+    ret = ::posix_memalign((void **)&buf, block_size, write_size);
+    if (ret != 0) {
+      return -ret;
+    }
+    memset(static_cast<void*>(buf), 0, write_size);
+    uint64_t i = 0;
+    for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {
+      ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
+      if (ret < 0) {
+	// grab errno first: free() is not guaranteed to leave it untouched
+	int err = errno;
+	free(buf);
+	return -err;
+      }
+    }
+    // tail shorter than one full chunk
+    if (i < (uint64_t)max_size) {
+      ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
+      if (ret < 0) {
+	int err = errno;
+	free(buf);
+	return -err;
+      }
+    }
+    free(buf);
+  }
+
+
+  dout(10) << "_open journal is not a block device, NOT checking disk "
+           << "write cache on '" << fn << "'" << dendl;
+
+  return 0;
+}
+
+/**
+ * Verify that the on-disk journal header decodes and carries our fsid.
+ *
+ * This can not be used on an active journal: fd must be -1 on entry, and
+ * the journal is closed again before returning.
+ *
+ * @return 0 if the header looks ok, -EINVAL on fsid mismatch, or the
+ *         error from _open()/read_header()
+ */
+int FileJournal::check()
+{
+  ceph_assert(fd == -1);
+
+  int ret = _open(false, false);
+  if (ret)
+    return ret;
+
+  ret = read_header(&header);
+  if (ret >= 0) {
+    if (header.fsid != fsid) {
+      derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+           << ", invalid (someone else's?) journal" << dendl;
+      ret = -EINVAL;
+    } else {
+      dout(1) << "check: header looks ok" << dendl;
+      ret = 0;
+    }
+  }
+
+  close();
+  return ret;
+}
+
+
+/**
+ * Create a fresh, empty journal.
+ *
+ * Opens (creating if necessary) the journal for writing, writes a new
+ * header with CRCs enabled, zeroes the first block after the header and
+ * checks that the journal can hold osd_max_write_size (shifted left by 20)
+ * plus entry overhead.  The descriptor is closed again before returning.
+ *
+ * @return 0 on success, negative errno on failure (-ENOSPC if the journal
+ *         is too small)
+ */
+int FileJournal::create()
+{
+  // declared up front because of the goto-based cleanup below
+  void *buf = nullptr;
+  int64_t needed_space;
+  int ret;
+  buffer::ptr bp;
+  dout(2) << "create " << fn << " fsid " << fsid << dendl;
+
+  ret = _open(true, true);
+  if (ret)
+    goto done;
+
+  // start from an empty header
+  header = header_t();
+  header.flags = header_t::FLAG_CRC;  // enable crcs on any new journal.
+  header.fsid = fsid;
+  header.max_size = max_size;
+  header.block_size = block_size;
+  if (cct->_conf->journal_block_align || directio) {
+    header.alignment = block_size;
+  } else {
+    // at least stay word aligned on 64bit machines...
+    header.alignment = 16;
+  }
+
+  header.start = get_top();
+  header.start_seq = 0;
+
+  print_header(header);
+
+  // (re)build the zeroed buffer used for alignment padding
+  delete [] zero_buf;
+  zero_buf = new char[header.alignment];
+  memset(zero_buf, 0, header.alignment);
+
+  bp = prepare_header();
+  if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) {
+    ret = -errno;
+    derr << "FileJournal::create : create write header error "
+         << cpp_strerror(ret) << dendl;
+    goto close_fd;
+  }
+
+  // zero the first block after the header as well
+  ret = posix_memalign(&buf, block_size, block_size);
+  if (ret) {
+    ret = -ret;
+    derr << "FileJournal::create: failed to allocate " << block_size
+         << " bytes of memory: " << cpp_strerror(ret) << dendl;
+    goto close_fd;
+  }
+  memset(buf, 0, block_size);
+  if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) {
+    ret = -errno;
+    derr << "FileJournal::create: error zeroing first " << block_size
+         << " bytes " << cpp_strerror(ret) << dendl;
+    goto free_buf;
+  }
+
+  // the journal has to be able to take one maximum-sized write
+  needed_space = cct->_conf->osd_max_write_size << 20;
+  needed_space += (2 * sizeof(entry_header_t)) + get_top();
+  if (header.max_size - header.start < needed_space) {
+    derr << "FileJournal::create: OSD journal is not large enough to hold "
+         << "osd_max_write_size bytes!" << dendl;
+    ret = -ENOSPC;
+    goto free_buf;
+  }
+
+  dout(2) << "create done" << dendl;
+  ret = 0;
+
+free_buf:
+  free(buf);
+  buf = nullptr;
+close_fd:
+  // a failing close overrides whatever result we had so far
+  if (TEMP_FAILURE_RETRY(::close(fd)) < 0) {
+    ret = -errno;
+    derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret)
+         << dendl;
+  }
+done:
+  fd = -1;
+  return ret;
+}
+
+/**
+ * Read the fsid stored in the on-disk journal header.
+ *
+ * This can not be used on an active journal: fd must be -1 on entry and
+ * the journal is closed again before returning.
+ *
+ * @param fsid receives header.fsid when the header was read successfully
+ * @return the result of _open()/read_header(); negative on failure
+ */
+int FileJournal::peek_fsid(uuid_d& fsid)
+{
+  ceph_assert(fd == -1);
+
+  int r = _open(false, false);
+  if (r)
+    return r;
+
+  r = read_header(&header);
+  if (r >= 0)
+    fsid = header.fsid;
+
+  close();
+  return r;
+}
+
+/**
+ * Open an existing journal and position read_pos for replay.
+ *
+ * The header is read and validated against the expected fsid, the current
+ * max_size/block_size, and the directio alignment constraints.  Entries
+ * are then scanned from header.start until the entry with sequence
+ * fs_op_seq + 1 is found (read_pos is left at it), the end of the journal
+ * is reached, or a later sequence shows up, in which case the journal
+ * contents are ignored (read_pos = -1).
+ *
+ * @param fs_op_seq last sequence the caller already has applied
+ * @return 0 on success; a negative error code if the journal cannot be
+ *         opened or the header is unusable (the journal is closed then)
+ */
+int FileJournal::open(uint64_t fs_op_seq)
+{
+  dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl;
+
+  uint64_t next_seq = fs_op_seq + 1;
+  uint64_t seq = -1;
+
+  int err = _open(false);
+  if (err)
+    return err;
+
+  // assume writeable, unless...
+  read_pos = 0;
+  write_pos = get_top();
+
+  err = read_header(&header);
+  if (err < 0)
+    goto out;
+
+  // (re)build the zeroed buffer used for alignment padding
+  delete [] zero_buf;
+  zero_buf = new char[header.alignment];
+  memset(zero_buf, 0, header.alignment);
+
+  dout(10) << "open header.fsid = " << header.fsid
+           << dendl;
+
+  // --- header sanity checks; any failure closes the journal ---
+  if (header.fsid != fsid) {
+    derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+         << ", invalid (someone else's?) journal" << dendl;
+    err = -EINVAL;
+    goto out;
+  }
+  if (header.max_size > max_size) {
+    dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl;
+    err = -EINVAL;
+    goto out;
+  }
+  if (header.block_size != block_size) {
+    dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl;
+    err = -EINVAL;
+    goto out;
+  }
+  if (header.max_size % header.block_size) {
+    dout(2) << "open journal max size " << header.max_size
+            << " not a multiple of block size " << header.block_size << dendl;
+    err = -EINVAL;
+    goto out;
+  }
+  if (directio && header.alignment != block_size) {
+    dout(0) << "open journal alignment " << header.alignment << " does not match block size "
+            << block_size << " (required for direct_io journal mode)" << dendl;
+    err = -EINVAL;
+    goto out;
+  }
+  if (directio && (header.alignment % CEPH_DIRECTIO_ALIGNMENT)) {
+    dout(0) << "open journal alignment " << header.alignment
+            << " is not multiple of minimum directio alignment "
+            << CEPH_DIRECTIO_ALIGNMENT << " (required for direct_io journal mode)"
+            << dendl;
+    err = -EINVAL;
+    goto out;
+  }
+
+  // looks like a valid header.
+  write_pos = 0;  // not writeable yet
+
+  journaled_seq = header.committed_up_to;
+
+  // walk the entries looking for next_seq
+  read_pos = header.start;
+  seq = header.start_seq;
+
+  for (;;) {
+    bufferlist bl;
+    const off64_t entry_pos = read_pos;
+    if (!read_entry(bl, seq)) {
+      dout(10) << "open reached end of journal." << dendl;
+      break;
+    }
+    if (seq > next_seq) {
+      dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq
+               << ", ignoring journal contents"
+               << dendl;
+      read_pos = -1;
+      last_committed_seq = 0;
+      return 0;
+    }
+    if (seq == next_seq) {
+      dout(10) << "open reached seq " << seq << dendl;
+      read_pos = entry_pos;  // rewind to the start of this entry
+      break;
+    }
+    seq++;  // next event should follow.
+  }
+
+  return 0;
+out:
+  close();
+  return err;
+}
+
+/// Close the given descriptor, retrying on EINTR; the result is discarded.
+void FileJournal::_close(int fd) const
+{
+  VOID_TEMP_FAILURE_RETRY( ::close(fd) );
+}
+
+/**
+ * Stop the writer and close the journal descriptor.
+ *
+ * Asserts that nothing is left queued, that no header write is pending,
+ * and that the journal is actually open.
+ */
+void FileJournal::close()
+{
+  dout(1) << "close " << fn << dendl;
+
+  // the writer thread has to be gone before the fd goes away
+  stop_writer();
+
+  ceph_assert(writeq_empty());
+  ceph_assert(!must_write_header);
+  ceph_assert(fd >= 0);
+
+  _close(fd);
+  fd = -1;
+}
+
+
+/// Dump the journal to out, including the decoded transactions of each entry.
+int FileJournal::dump(ostream& out)
+{
+  const bool simple = false;
+  return _dump(out, simple);
+}
+
+/// Dump the journal to out, reporting only the length of each entry's payload.
+int FileJournal::simple_dump(ostream& out)
+{
+  const bool simple = true;
+  return _dump(out, simple);
+}
+
+/**
+ * Run _fdump() into a JSONFormatter and flush the result to out.
+ *
+ * The formatter is flushed even when _fdump() reports an error.
+ *
+ * @return whatever _fdump() returned
+ */
+int FileJournal::_dump(ostream& out, bool simple)
+{
+  JSONFormatter jf(true);
+  const int r = _fdump(jf, simple);
+  jf.flush(out);
+  return r;
+}
+
+/**
+ * Dump the journal header and all readable entries through a Formatter.
+ *
+ * The journal must not be open (fd == -1); it is opened read-only for the
+ * duration of the dump and closed again afterwards.
+ *
+ * @param f      formatter receiving a "journal" object with a "header"
+ *               section and an "entries" array
+ * @param simple if true only the payload length of each entry is dumped,
+ *               otherwise every transaction in the entry is decoded and
+ *               dumped
+ * @return 0 on success, -EINVAL if the journal is unreadable or ends
+ *         before header.committed_up_to, or the error from
+ *         _open()/read_header()
+ */
+int FileJournal::_fdump(Formatter &f, bool simple)
+{
+  dout(10) << "_fdump" << dendl;
+
+  ceph_assert(fd == -1);
+  int err = _open(false, false);
+  if (err)
+    return err;
+
+  err = read_header(&header);
+  if (err < 0) {
+    close();
+    return err;
+  }
+
+  off64_t next_pos = header.start;
+
+  f.open_object_section("journal");
+
+  // ---- header ----
+  f.open_object_section("header");
+  f.dump_unsigned("flags", header.flags);
+  ostringstream os;
+  os << header.fsid;
+  f.dump_string("fsid", os.str());
+  f.dump_unsigned("block_size", header.block_size);
+  f.dump_unsigned("alignment", header.alignment);
+  f.dump_int("max_size", header.max_size);
+  f.dump_int("start", header.start);
+  f.dump_unsigned("committed_up_to", header.committed_up_to);
+  f.dump_unsigned("start_seq", header.start_seq);
+  f.close_section();
+
+  // ---- entries ----
+  f.open_array_section("entries");
+  uint64_t seq = header.start_seq;
+  for (;;) {
+    bufferlist bl;
+    off64_t pos = next_pos;
+
+    if (!pos) {
+      dout(2) << "_dump -- not readable" << dendl;
+      err = -EINVAL;
+      break;
+    }
+
+    stringstream ss;
+    read_entry_result result = do_read_entry(pos, &next_pos, &bl, &seq, &ss);
+    if (result != SUCCESS) {
+      // running out of entries is only an error if the header claims more
+      // were committed
+      if (seq < header.committed_up_to) {
+        dout(2) << "Unable to read past sequence " << seq
+                << " but header indicates the journal has committed up through "
+                << header.committed_up_to << ", journal is corrupt" << dendl;
+        err = -EINVAL;
+      }
+      dout(25) << ss.str() << dendl;
+      dout(25) << "No further valid entries found, journal is most likely valid"
+               << dendl;
+      break;
+    }
+
+    f.open_object_section("entry");
+    f.dump_unsigned("offset", pos);
+    f.dump_unsigned("seq", seq);
+    if (simple) {
+      f.dump_unsigned("bl.length", bl.length());
+    } else {
+      f.open_array_section("transactions");
+      auto p = bl.cbegin();
+      for (int trans_num = 0; !p.end(); ++trans_num) {
+        ObjectStore::Transaction t(p);
+        f.open_object_section("transaction");
+        f.dump_unsigned("trans_num", trans_num);
+        t.dump(&f);
+        f.close_section();
+      }
+      f.close_section();
+    }
+    f.close_section();
+  }
+
+  f.close_section();  // entries
+  f.close_section();  // journal
+  dout(10) << "dump finish" << dendl;
+
+  close();
+  return err;
+}
+
+
+/**
+ * Clear the stop flags and launch the journal write thread, plus the aio
+ * completion thread when aio is in use (HAVE_LIBAIO builds only).
+ */
+void FileJournal::start_writer()
+{
+  write_stop = false;
+  aio_stop = false;
+
+  write_thread.create("journal_write");
+#ifdef HAVE_LIBAIO
+  if (aio) {
+    write_finish_thread.create("journal_wrt_fin");
+  }
+#endif
+}
+
+/**
+ * Stop the write thread (and, with aio, the completion thread).
+ *
+ * After the write thread has been joined the journal header is written
+ * synchronously.  Calling this when the writer is already stopped skips
+ * that part.
+ */
+void FileJournal::stop_writer()
+{
+  // Do nothing if writer already stopped or never started
+  if (!write_stop) {
+    {
+      Mutex::Locker wl(write_lock);
+      Mutex::Locker ql(writeq_lock);
+      write_stop = true;
+      writeq_cond.Signal();
+      // Doesn't hurt to signal commit_cond in case thread is waiting there
+      // and caller didn't use committed_thru() first.
+      commit_cond.Signal();
+    }
+    write_thread.join();
+
+    // write journal header now so that we have less to replay on remount
+    write_header_sync();
+  }
+
+#ifdef HAVE_LIBAIO
+  // stop aio completion thread *after* writer thread has stopped
+  // and has submitted all of its io
+  if (aio && !aio_stop) {
+    {
+      Mutex::Locker al(aio_lock);
+      aio_stop = true;
+      aio_cond.Signal();
+      write_finish_cond.Signal();
+    }
+    // join outside of aio_lock
+    write_finish_thread.join();
+  }
+#endif
+}
+
+
+
+/// Log the geometry of the given header and the current write_pos at debug level 10.
+void FileJournal::print_header(const header_t &header) const
+{
+  dout(10) << "header: block_size " << header.block_size
+           << " alignment " << header.alignment
+           << " max_size " << header.max_size
+           << dendl;
+  dout(10) << "header: start " << header.start
+           << dendl;
+  dout(10) << " write_pos " << write_pos
+           << dendl;
+}
+
+/**
+ * Read and decode the journal header from offset 0 of fd.
+ *
+ * One block_size worth of data is read; if the read comes back short the
+ * remainder of the buffer is zero-filled before decoding.  Flag values
+ * above 3 are treated as garbage and reset to 0.
+ *
+ * @param hdr receives the decoded header
+ * @return 0 on success, negative errno if the read fails, -EINVAL if the
+ *         header cannot be decoded
+ */
+int FileJournal::read_header(header_t *hdr) const
+{
+  dout(10) << "read_header" << dendl;
+
+  buffer::ptr bp = buffer::create_small_page_aligned(block_size);
+  char *data = bp.c_str();
+  const int r = ::pread(fd, data, bp.length(), 0);
+  if (r < 0) {
+    int err = errno;
+    dout(0) << "read_header got " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  // don't use bp.zero() here, because it also invalidates
+  // crc cache (which is not yet populated anyway)
+  if ((size_t)r != bp.length()) {
+    // r will be always less or equal than bp.length
+    memset(data + r, 0, bp.length() - r);
+  }
+
+  bufferlist bl;
+  bl.push_back(std::move(bp));
+
+  try {
+    auto it = bl.cbegin();
+    decode(*hdr, it);
+  }
+  catch (buffer::error& e) {
+    derr << "read_header error decoding journal header" << dendl;
+    return -EINVAL;
+  }
+
+  /*
+   * Unfortunately we weren't initializing the flags field for new
+   * journals!  Aie.  This is safe(ish) now that we have only one
+   * flag.  Probably around when we add the next flag we need to
+   * remove this or else this (eventually old) code will clobber newer
+   * code's flags.
+   */
+  if (hdr->flags > 3) {
+    derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
+    hdr->flags = 0;
+  }
+
+  print_header(*hdr);
+  return 0;
+}
+
+/**
+ * Build the on-disk image of the current header.
+ *
+ * header.committed_up_to is refreshed from journaled_seq (under
+ * finisher_lock), the header is encoded, and the encoding is copied into
+ * a page-aligned buffer of get_top() bytes whose tail is zero-filled.
+ *
+ * @return the buffer ready to be written at offset 0
+ */
+bufferptr FileJournal::prepare_header()
+{
+  {
+    Mutex::Locker l(finisher_lock);
+    header.committed_up_to = journaled_seq;
+  }
+
+  bufferlist encoded;
+  encode(header, encoded);
+
+  bufferptr bp = buffer::create_small_page_aligned(get_top());
+  // don't use bp.zero() here, because it also invalidates
+  // crc cache (which is not yet populated anyway)
+  char *dst = bp.c_str();
+  memcpy(dst, encoded.c_str(), encoded.length());
+  dst += encoded.length();
+  memset(dst, 0, bp.length() - encoded.length());
+  return bp;
+}
+
+/**
+ * Force the header out to disk: take write_lock, flag the header as
+ * pending and run do_write() with an empty payload.
+ */
+void FileJournal::write_header_sync()
+{
+  Mutex::Locker l(write_lock);
+  must_write_header = true;
+
+  bufferlist nothing;  // no entry data, header only
+  do_write(nothing);
+
+  dout(20) << __func__ << " finish" << dendl;
+}
+
+/**
+ * Decide whether an entry of size bytes fits when written at pos.
+ *
+ * Free room is computed from pos, header.start, header.max_size and
+ * get_top(), minus one byte so that pos == header.start only ever means
+ * "empty".  If this write would take the free room below half of
+ * max_size and do_sync_cond is set, it is signalled.  If the entry fits
+ * but runs past header.max_size, must_write_header is set.
+ *
+ * @param seq  not used by this function
+ * @param pos  position the entry would be written at
+ * @param size size of the entry in bytes
+ * @return 0 if there is room, -ENOSPC if the journal is (already) full
+ */
+int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size)
+{
+  // already full?
+  if (full_state != FULL_NOTFULL)
+    return -ENOSPC;
+
+  // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL.
+  off64_t room;
+  if (pos >= header.start)
+    room = (header.max_size - pos) + (header.start - get_top()) - 1;
+  else
+    room = header.start - pos - 1;
+  dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start
+	   << " top " << get_top() << dendl;
+
+  if (do_sync_cond) {
+    if (room >= (header.max_size >> 1) &&
+        room - size < (header.max_size >> 1)) {
+      dout(10) << " passing half full mark, triggering commit" << dendl;
+      do_sync_cond->SloppySignal();  // initiate a real commit so we can trim
+    }
+  }
+
+  if (room >= size) {
+    dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl;
+    // the entry runs past the end of the journal; flag a header write
+    if (pos + size > header.max_size)
+      must_write_header = true;
+    return 0;
+  }
+
+  // full: report the quantities that were actually compared (size vs room)
+  dout(1) << "check_for_full at " << pos << " : JOURNAL FULL "
+	  << size << " > " << room
+	  << " (max_size " << header.max_size << " start " << header.start << ")"
+	  << dendl;
+
+  off64_t max = header.max_size - get_top();
+  if (size > max)
+    dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl;
+
+  return -ENOSPC;
+}
+
+// Gather queued write items into bl for a single journal write.
+//
+// Items are popped from the write queue in batches and appended via
+// prepare_single_write() until the queue is empty, journal_max_write_entries
+// items have been taken, bl has reached journal_max_write_bytes, or the
+// journal reports -ENOSPC.  Items that were popped but not consumed are
+// pushed back with batch_unpop_write().
+//
+// orig_ops / orig_bytes are passed through to prepare_single_write(), which
+// accumulates the number and original size of the prepared entries.
+//
+// Returns 0 when there is something (possibly nothing) to write, -ENOSPC
+// when the journal is already full or the very first entry does not fit.
+int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+  // gather queued writes
+  off64_t queue_pos = write_pos;
+
+  // per-write limits; a value of 0 disables the respective limit
+  int eleft = cct->_conf->journal_max_write_entries;
+  unsigned bmax = cct->_conf->journal_max_write_bytes;
+
+  if (full_state != FULL_NOTFULL)
+    return -ENOSPC;
+
+  while (!writeq_empty()) {
+    list<write_item> items;
+    batch_pop_write(items);
+    list<write_item>::iterator it = items.begin();
+    while (it != items.end()) {
+      // remember the length now; prepare_single_write() moves the data out of it->bl
+      uint64_t bytes = it->bl.length();
+      int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes);
+      if (r == 0) { // prepare ok, delete it
+	items.erase(it++);
+#ifdef HAVE_LIBAIO
+	{
+	  Mutex::Locker locker(aio_lock);
+	  ceph_assert(aio_write_queue_ops > 0);
+	  aio_write_queue_ops--;
+	  ceph_assert(aio_write_queue_bytes >= bytes);
+	  aio_write_queue_bytes -= bytes;
+	}
+#else
+	(void)bytes;
+#endif
+      }
+      if (r == -ENOSPC) {
+        // the journal maybe full, insert the left item to writeq
+        batch_unpop_write(items);
+        if (orig_ops)
+          goto out;         // commit what we have
+
+        if (logger)
+          logger->inc(l_filestore_journal_full);
+
+        if (wait_on_full) {
+          dout(20) << "prepare_multi_write full on first entry, need to wait" << dendl;
+        } else {
+          dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl;
+
+          // throw out what we have so far
+          full_state = FULL_FULL;
+          while (!writeq_empty()) {
+            complete_write(1, peek_write().orig_len);
+            pop_write();
+          }
+          print_header(header);
+        }
+
+        return -ENOSPC;  // hrm, full on first op
+      }
+      if (eleft) {
+        if (--eleft == 0) {
+          dout(20) << "prepare_multi_write hit max events per write "
+		   << cct->_conf->journal_max_write_entries << dendl;
+          batch_unpop_write(items);
+          goto out;
+        }
+      }
+      if (bmax) {
+        if (bl.length() >= bmax) {
+          dout(20) << "prepare_multi_write hit max write size "
+		   << cct->_conf->journal_max_write_bytes << dendl;
+          batch_unpop_write(items);
+          goto out;
+        }
+      }
+    }
+  }
+
+out:
+  dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl;
+  // queue_pos must equal write_pos advanced by bl.length(), either directly
+  // or after wrapping past header.max_size back to get_top()
+  ceph_assert((write_pos + bl.length() == queue_pos) ||
+         (write_pos + bl.length() - header.max_size + get_top() == queue_pos));
+  return 0;
+}
+
+/*
+void FileJournal::queue_write_fin(uint64_t seq, Context *fin)
+{
+  writing_seq.push_back(seq);
+  if (!waiting_for_notfull.empty()) {
+    // make sure previously unjournaled stuff waiting for UNFULL triggers
+    // _before_ newly journaled stuff does
+    dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin
+	     << " until after UNFULL" << dendl;
+    C_Gather *g = new C_Gather(writeq.front().fin);
+    writing_fin.push_back(g->new_sub());
+    waiting_for_notfull.push_back(g->new_sub());
+  } else {
+    writing_fin.push_back(writeq.front().fin);
+    dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl;
+  }
+}
+*/
+
+/**
+ * Hand every pending completion with a sequence <= seq to the finisher.
+ *
+ * Must be called with finisher_lock held.  Completions are taken in queue
+ * order; the first one with a larger sequence stops the scan and it and
+ * everything after it are put back.  finisher_cond is signalled at the end.
+ *
+ * @param seq highest sequence whose completion may be queued
+ */
+void FileJournal::queue_completions_thru(uint64_t seq)
+{
+  ceph_assert(finisher_lock.is_locked());
+  utime_t now = ceph_clock_now();
+
+  list<completion_item> pending;
+  batch_pop_completions(pending);
+
+  for (list<completion_item>::iterator it = pending.begin();
+       it != pending.end(); ) {
+    completion_item& item = *it;
+    if (item.seq > seq)
+      break;
+
+    // time from item.start until now
+    utime_t lat = now;
+    lat -= item.start;
+    dout(10) << "queue_completions_thru seq " << seq
+             << " queueing seq " << item.seq
+             << " " << item.finish
+             << " lat " << lat << dendl;
+    if (logger) {
+      logger->tinc(l_filestore_journal_latency, lat);
+    }
+    if (item.finish)
+      finisher->queue(item.finish);
+    if (item.tracked_op) {
+      item.tracked_op->mark_event("journaled_completion_queued");
+      item.tracked_op->journal_trace.event("queued completion");
+      item.tracked_op->journal_trace.keyval("completed through", seq);
+    }
+    it = pending.erase(it);
+  }
+
+  // whatever was not consumed goes back to the completion queue
+  batch_unpop_completions(pending);
+  finisher_cond.Signal();
+}
+
+
+/**
+ * Finalise one queued entry and append it to the pending write buffer.
+ *
+ * After check_for_full() has confirmed there is room, the sequence number,
+ * the journal position (stored in the magic1 field) and magic2 are patched
+ * into the entry_header_t at the front of the entry and into the copy at
+ * its very end.  The entry is then moved into bl, recorded in journalq,
+ * and queue_pos is advanced (wrapping back past get_top() at
+ * header.max_size).
+ *
+ * @param next_write item to prepare; its bufferlist is consumed
+ * @param bl         write buffer the entry is appended to
+ * @param queue_pos  in: position of this entry; out: position of the next
+ * @param orig_ops   incremented by one
+ * @param orig_bytes incremented by next_write.orig_len
+ * @return 0 on success or the negative result of check_for_full()
+ */
+int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+  uint64_t seq = next_write.seq;
+  bufferlist &ebl = next_write.bl;
+  off64_t size = ebl.length();
+
+  int r = check_for_full(seq, queue_pos, size);
+  if (r < 0)
+    return r;   // ENOSPC or EAGAIN
+
+  uint32_t orig_len = next_write.orig_len;
+  orig_bytes += orig_len;
+  orig_ops++;
+
+  // add to write buffer
+  dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq
+           << " len " << orig_len << " -> " << size << dendl;
+
+  const unsigned seq_offset = offsetof(entry_header_t, seq);
+  const unsigned magic1_offset = offsetof(entry_header_t, magic1);
+  const unsigned magic2_offset = offsetof(entry_header_t, magic2);
+
+  uint64_t _seq = seq;
+  uint64_t _queue_pos = queue_pos;
+  uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64());
+
+  // patch the three fields into an entry_header_t located at `base`
+  auto stamp = [&](bufferptr &bp, unsigned base) {
+    bp.copy_in(base + seq_offset, sizeof(uint64_t), (char *)&_seq);
+    bp.copy_in(base + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
+    bp.copy_in(base + magic2_offset, sizeof(uint64_t), (char *)&magic2);
+  };
+
+  bufferptr headerptr = ebl.buffers().front();
+  stamp(headerptr, 0);
+
+  // the trailing copy occupies the last sizeof(entry_header_t) bytes
+  bufferptr footerptr = ebl.buffers().back();
+  unsigned post_offset  = footerptr.length() - sizeof(entry_header_t);
+  stamp(footerptr, post_offset);
+
+  bl.claim_append(ebl);
+  if (next_write.tracked_op) {
+    next_write.tracked_op->mark_event("write_thread_in_journal_buffer");
+    next_write.tracked_op->journal_trace.event("prepare_single_write");
+  }
+
+  journalq.push_back(pair<uint64_t,off64_t>(seq, queue_pos));
+  writing_seq = seq;
+
+  // advance, wrapping around the end of the journal
+  queue_pos += size;
+  if (queue_pos >= header.max_size)
+    queue_pos = queue_pos + get_top() - header.max_size;
+
+  return 0;
+}
+
+/**
+ * Diagnose a misaligned direct-io write.
+ *
+ * Does nothing unless directio is on and bl fails the size/memory
+ * alignment test; in that case it asserts on the length and position
+ * alignment and finally aborts.
+ */
+void FileJournal::check_align(off64_t pos, bufferlist& bl)
+{
+  if (!directio)
+    return;
+  // make sure list segments are page aligned
+  if (bl.is_aligned_size_and_memory(block_size, CEPH_DIRECTIO_ALIGNMENT))
+    return;
+
+  ceph_assert((bl.length() & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
+  ceph_assert((pos & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
+  ceph_abort_msg("bl was not aligned");
+}
+
+/**
+ * Write bl to the journal at pos.
+ *
+ * @param pos in: file offset to write at; out: advanced by bl.length(),
+ *            and reset to get_top() if that lands exactly on
+ *            header.max_size
+ * @param bl  data to write
+ * @return 0 on success, a negative errno if the seek fails, or the
+ *         non-zero result of bufferlist::write_fd()
+ */
+int FileJournal::write_bl(off64_t& pos, bufferlist& bl)
+{
+  if (::lseek64(fd, pos, SEEK_SET) < 0) {
+    int ret = -errno;
+    derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  int ret = bl.write_fd(fd);
+  if (ret) {
+    derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  pos += bl.length();
+  if (pos == header.max_size) {
+    // reached the end exactly: wrap to the first byte after the header
+    pos = get_top();
+  }
+  return 0;
+}
+
+// Synchronously write bl (and, when due, the journal header) to disk.
+//
+// write_lock is released around the actual I/O and re-taken afterwards, so
+// the caller is expected to hold it on entry (write_header_sync() does).
+// write_pos is advanced before the lock is dropped.  A write that would
+// run past header.max_size is split in two, the wrapped-around part being
+// written first.  Any write or sync failure aborts the process.  When not
+// using directio the data is fsync'ed/fdatasync'ed.  Finally journaled_seq
+// is updated and, unless the journal is full or completions are plugged,
+// completions up to that sequence are queued.
+void FileJournal::do_write(bufferlist& bl)
+{
+  // nothing to do?
+  if (bl.length() == 0 && !must_write_header)
+    return;
+
+  buffer::ptr hbp;
+  // force a header write every journal_write_header_frequency calls (0 disables)
+  if (cct->_conf->journal_write_header_frequency &&
+      (((++journaled_since_start) %
+	cct->_conf->journal_write_header_frequency) == 0)) {
+    must_write_header = true;
+  }
+
+  if (must_write_header) {
+    must_write_header = false;
+    hbp = prepare_header();
+  }
+
+  dout(15) << "do_write writing " << write_pos << "~" << bl.length()
+	   << (hbp.length() ? " + header":"")
+	   << dendl;
+
+  utime_t from = ceph_clock_now();
+
+  // entry
+  off64_t pos = write_pos;
+
+  // Adjust write_pos
+  write_pos += bl.length();
+  if (write_pos >= header.max_size)
+    write_pos = write_pos - header.max_size + get_top();
+
+  // the I/O below runs without write_lock held
+  write_lock.Unlock();
+
+  // split?
+  off64_t split = 0;
+  if (pos + bl.length() > header.max_size) {
+    bufferlist first, second;
+    split = header.max_size - pos;
+    first.substr_of(bl, 0, split);
+    second.substr_of(bl, split, bl.length() - split);
+    ceph_assert(first.length() + second.length() == bl.length());
+    dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length()
+	     << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl;
+
+    //Save pos to write first piece second
+    off64_t first_pos = pos;
+    off64_t orig_pos;
+    pos = get_top();
+    // header too?
+    if (hbp.length()) {
+      // be sneaky: include the header in the second fragment
+      bufferlist tmp;
+      tmp.push_back(hbp);
+      tmp.claim_append(second);
+      second.swap(tmp);
+      pos = 0;          // we included the header
+    }
+    // Write the second portion first possible with the header, so
+    // do_read_entry() won't even get a valid entry_header_t if there
+    // is a crash between the two writes.
+    orig_pos = pos;
+    if (write_bl(pos, second)) {
+      derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+	   << ") failed" << dendl;
+      check_align(pos, second);
+      ceph_abort();
+    }
+    orig_pos = first_pos;
+    if (write_bl(first_pos, first)) {
+      derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+	   << ") failed" << dendl;
+      check_align(first_pos, first);
+      ceph_abort();
+    }
+    // the first fragment ends exactly at max_size, so write_bl() wrapped it to get_top()
+    ceph_assert(first_pos == get_top());
+  } else {
+    // header too?
+    if (hbp.length()) {
+      if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) {
+	int err = errno;
+	derr << "FileJournal::do_write: pwrite(fd=" << fd
+	     << ", hbp.length=" << hbp.length() << ") failed :"
+	     << cpp_strerror(err) << dendl;
+	ceph_abort();
+      }
+    }
+
+    if (write_bl(pos, bl)) {
+      derr << "FileJournal::do_write: write_bl(pos=" << pos
+	   << ") failed" << dendl;
+      check_align(pos, bl);
+      ceph_abort();
+    }
+  }
+
+  if (!directio) {
+    dout(20) << "do_write fsync" << dendl;
+
+    /*
+     * We'd really love to have a fsync_range or fdatasync_range and do a:
+     *
+     *  if (split) {
+     *    ::fsync_range(fd, header.max_size - split, split);
+     *    ::fsync_range(fd, get_top(), bl.length() - split);
+     *  else
+     *    ::fsync_range(fd, write_pos, bl.length())
+     *
+     * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be
+     * too hard given all the underlying infrastructure already exist.
+     *
+     * NOTE: using sync_file_range here would not be safe as it does not
+     * flush disk caches or commits any sort of metadata.
+     */
+    int ret = 0;
+#if defined(__APPLE__) || defined(__FreeBSD__)
+    ret = ::fsync(fd);
+#else
+    ret = ::fdatasync(fd);
+#endif
+    if (ret < 0) {
+      derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+#ifdef HAVE_POSIX_FADVISE
+    if (cct->_conf->filestore_fadvise)
+      posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+  }
+
+  utime_t lat = ceph_clock_now() - from;
+  dout(20) << "do_write latency " << lat << dendl;
+
+  // back under write_lock for the bookkeeping
+  write_lock.Lock();
+
+  // the position reached by the writes must match the pre-computed write_pos
+  ceph_assert(write_pos == pos);
+  ceph_assert(write_pos % header.alignment == 0);
+
+  {
+    Mutex::Locker locker(finisher_lock);
+    journaled_seq = writing_seq;
+
+    // kick finisher?
+    //  only if we haven't filled up recently!
+    if (full_state != FULL_NOTFULL) {
+      dout(10) << "do_write NOT queueing finisher seq " << journaled_seq
+	       << ", full_commit_seq|full_restart_seq" << dendl;
+    } else {
+      if (plug_journal_completions) {
+	dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq
+		 << " due to completion plug" << dendl;
+      } else {
+	dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl;
+	queue_completions_thru(journaled_seq);
+      }
+    }
+  }
+}
+
+/**
+ * Wait until completions_empty() holds, then wait for the finisher to
+ * report that it is empty as well.
+ */
+void FileJournal::flush()
+{
+  dout(10) << "waiting for completions to empty" << dendl;
+  {
+    Mutex::Locker finisher_guard(finisher_lock);
+    for (;;) {
+      if (completions_empty())
+        break;
+      // sleep on finisher_cond; finisher_lock is the associated mutex
+      finisher_cond.Wait(finisher_lock);
+    }
+  }
+
+  dout(10) << "flush waiting for finisher" << dendl;
+  finisher->wait_for_empty();
+
+  dout(10) << "flush done" << dendl;
+}
+
+
+/**
+ * Main loop of the journal write thread.
+ *
+ * Each pass: sleep while the write queue is empty and no header write is
+ * pending (exiting instead if write_stop is set), optionally back off while
+ * aios are in flight, then batch queued entries with prepare_multi_write()
+ * and hand the resulting buffer to do_aio_write() or do_write().
+ */
+void FileJournal::write_thread_entry()
+{
+  dout(10) << "write_thread_entry start" << dendl;
+  while (1) {
+    {
+      // nothing queued and no header to write: either stop or wait for work
+      Mutex::Locker locker(writeq_lock);
+      if (writeq.empty() && !must_write_header) {
+	if (write_stop)
+	  break;
+	dout(20) << "write_thread_entry going to sleep" << dendl;
+	writeq_cond.Wait(writeq_lock);
+	dout(20) << "write_thread_entry woke up" << dendl;
+	continue;
+      }
+    }
+
+#ifdef HAVE_LIBAIO
+    if (aio) {
+      Mutex::Locker locker(aio_lock);
+      // should we back off to limit aios in flight?  try to do this
+      // adaptively so that we submit larger aios once we have lots of
+      // them in flight.
+      //
+      // NOTE: our condition here is based on aio_num (protected by
+      // aio_lock) and throttle_bytes (part of the write queue).  when
+      // we sleep, we *only* wait for aio_num to change, and do not
+      // wake when more data is queued.  this is not strictly correct,
+      // but should be fine given that we will have plenty of aios in
+      // flight if we hit this limit to ensure we keep the device
+      // saturated.
+      while (aio_num > 0) {
+	// required pending bytes doubles per two aios in flight, capped at 2^24
+	int exp = std::min<int>(aio_num * 2, 24);
+	long unsigned min_new = 1ull << exp;
+	uint64_t cur = aio_write_queue_bytes;
+	dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes
+		 << " ... exp " << exp << " min_new " << min_new
+		 << " ... pending " << cur << dendl;
+	if (cur >= min_new)
+	  break;
+	dout(20) << "write_thread_entry deferring until more aios complete: "
+		 << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new
+		 << " bytes to start a new aio (currently " << cur << " pending)" << dendl;
+	aio_cond.Wait(aio_lock);
+	dout(20) << "write_thread_entry woke up" << dendl;
+      }
+    }
+#endif
+
+    // write_lock is held for the remainder of this pass
+    Mutex::Locker locker(write_lock);
+    uint64_t orig_ops = 0;
+    uint64_t orig_bytes = 0;
+
+    bufferlist bl;
+    int r = prepare_multi_write(bl, orig_ops, orig_bytes);
+    // Don't care about journal full if stoppping, so drop queue and
+    // possibly let header get written and loop above to notice stop
+    if (r == -ENOSPC) {
+      if (write_stop) {
+	dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl;
+	while (!writeq_empty()) {
+	  complete_write(1, peek_write().orig_len);
+	  pop_write();
+	}
+	print_header(header);
+	r = 0;
+      } else {
+	dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl;
+	// commit_cond is signalled from committed_thru()
+	commit_cond.Wait(write_lock);
+	dout(20) << "write_thread_entry woke up" << dendl;
+	continue;
+      }
+    }
+    // any error other than -ENOSPC is treated as fatal
+    ceph_assert(r == 0);
+
+    if (logger) {
+      logger->inc(l_filestore_journal_wr);
+      logger->inc(l_filestore_journal_wr_bytes, bl.length());
+    }
+
+#ifdef HAVE_LIBAIO
+    if (aio)
+      do_aio_write(bl);
+    else
+      do_write(bl);
+#else
+    do_write(bl);
+#endif
+    complete_write(orig_ops, orig_bytes);
+  }
+
+  dout(10) << "write_thread_entry finish" << dendl;
+}
+
+#ifdef HAVE_LIBAIO
+/**
+ * Queue bl (and the journal header, when one is pending) for writing at
+ * write_pos via write_aio_bl(), then advance write_pos.
+ *
+ * When the data would run past header.max_size it is split in two: the
+ * first part goes to the end of the journal and the second part wraps to
+ * the beginning.  Any write_aio_bl() failure aborts.
+ *
+ * @param bl data to write; consumed by write_aio_bl()
+ */
+void FileJournal::do_aio_write(bufferlist& bl)
+{
+
+  // periodically force a header write, every journal_write_header_frequency calls
+  if (cct->_conf->journal_write_header_frequency &&
+      (((++journaled_since_start) %
+	cct->_conf->journal_write_header_frequency) == 0)) {
+    must_write_header = true;
+  }
+
+  // nothing to do?
+  if (bl.length() == 0 && !must_write_header)
+    return;
+
+  buffer::ptr hbp;
+  if (must_write_header) {
+    must_write_header = false;
+    hbp = prepare_header();
+  }
+
+  // entry
+  off64_t pos = write_pos;
+
+  dout(15) << "do_aio_write writing " << pos << "~" << bl.length()
+	   << (hbp.length() ? " + header":"")
+	   << dendl;
+
+  // split?
+  off64_t split = 0;
+  if (pos + bl.length() > header.max_size) {
+    bufferlist first, second;
+    split = header.max_size - pos;
+    first.substr_of(bl, 0, split);
+    second.substr_of(bl, split, bl.length() - split);
+    ceph_assert(first.length() + second.length() == bl.length());
+    dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl;
+
+    // seq 0: completion of the first fragment must not update journaled state
+    if (write_aio_bl(pos, first, 0)) {
+      derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+	   << ") failed" << dendl;
+      ceph_abort();
+    }
+    // write_aio_bl advanced pos by first.length(), i.e. to the end of the journal
+    ceph_assert(pos == header.max_size);
+    if (hbp.length()) {
+      // be sneaky: include the header in the second fragment
+      bufferlist tmp;
+      tmp.push_back(hbp);
+      tmp.claim_append(second);
+      second.swap(tmp);
+      pos = 0;          // we included the header
+    } else
+      pos = get_top();  // no header, start after that
+    if (write_aio_bl(pos, second, writing_seq)) {
+      derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+	   << ") failed" << dendl;
+      ceph_abort();
+    }
+  } else {
+    // header too?
+    if (hbp.length()) {
+      bufferlist hbl;
+      hbl.push_back(hbp);
+      // this local pos shadows the outer one, so the header write (at offset
+      // 0) does not advance the entry position
+      loff_t pos = 0;
+      if (write_aio_bl(pos, hbl, 0)) {
+	derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl;
+	ceph_abort();
+      }
+    }
+
+    if (write_aio_bl(pos, bl, writing_seq)) {
+      derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+	   << ") failed" << dendl;
+      ceph_abort();
+    }
+  }
+
+  // pos now points just past the last byte queued; wrap if at the end
+  write_pos = pos;
+  if (write_pos == header.max_size)
+    write_pos = get_top();
+  ceph_assert(write_pos % header.alignment == 0);
+}
+
+/**
+ * write a buffer using aio
+ *
+ * The buffer is submitted in pieces of at most IOV_MAX-1 buffers each; every
+ * piece becomes one aio_info appended to aio_queue.
+ *
+ * @param pos file offset to write at; advanced by the number of bytes submitted
+ * @param bl data to write; its contents are spliced out piece by piece
+ * @param seq seq to trigger when this aio completes.  if 0, do not update any state
+ * on completion.
+ * @return 0; an io_submit failure that is not a retried EAGAIN aborts
+ */
+int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
+{
+  dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl;
+
+  while (bl.length() > 0) {
+    // build an iovec over the leading buffers of bl
+    int max = std::min<int>(bl.get_num_buffers(), IOV_MAX-1);
+    iovec *iov = new iovec[max];
+    int n = 0;
+    unsigned len = 0;
+    for (auto p = std::cbegin(bl.buffers()); n < max; ++p, ++n) {
+      ceph_assert(p != std::cend(bl.buffers()));
+      iov[n].iov_base = const_cast<void*>(static_cast<const void*>(p->c_str()));
+      iov[n].iov_len = p->length();
+      len += p->length();
+    }
+
+    bufferlist tbl;
+    bl.splice(0, len, &tbl);  // move bytes from bl -> tbl
+
+    // lock only aio_queue, current aio, aio_num, aio_bytes, which may be
+    // modified in check_aio_completion
+    aio_lock.Lock();
+    // only the last piece carries seq; earlier pieces use 0
+    aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq));
+    aio_info& aio = aio_queue.back();
+    // the iovec array is stored on the queued aio_info
+    aio.iov = iov;
+
+    io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos);
+
+    dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len
+	     << " in " << n << dendl;
+
+    aio_num++;
+    aio_bytes += aio.len;
+
+    // need to save current aio len to update write_pos later because current
+    // aio could be erased from aio_queue once it is done
+    uint64_t cur_len = aio.len;
+    // unlock aio_lock because following io_submit might take time to return
+    aio_lock.Unlock();
+
+    iocb *piocb = &aio.iocb;
+
+    // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
+    int attempts = 16;
+    int delay = 125;
+    do {
+      int r = io_submit(aio_ctx, 1, &piocb);
+      dout(20) << "write_aio_bl io_submit return value: " << r << dendl;
+      if (r < 0) {
+	derr << "io_submit to " << aio.off << "~" << cur_len
+	     << " got " << cpp_strerror(r) << dendl;
+	// EAGAIN is retried with exponential backoff; anything else is fatal
+	if (r == -EAGAIN && attempts-- > 0) {
+	  usleep(delay);
+	  delay *= 2;
+	  continue;
+	}
+	check_align(pos, tbl);
+	ceph_abort_msg("io_submit got unexpected error");
+      } else {
+	break;
+      }
+    } while (true);
+    pos += cur_len;
+  }
+  // wake write_finish_thread_entry, which sleeps on write_finish_cond
+  aio_lock.Lock();
+  write_finish_cond.Signal();
+  aio_lock.Unlock();
+  return 0;
+}
+#endif
+
+/**
+ * Main loop of the aio completion thread (a no-op without HAVE_LIBAIO).
+ *
+ * Sleeps on write_finish_cond while aio_queue is empty (exiting if aio_stop
+ * is set), otherwise blocks in io_getevents(), marks the returned aios as
+ * done and calls check_aio_completion().  A short or failed aio, or an
+ * io_getevents() error other than EINTR, aborts.
+ */
+void FileJournal::write_finish_thread_entry()
+{
+#ifdef HAVE_LIBAIO
+  dout(10) << __func__ << " enter" << dendl;
+  while (true) {
+    {
+      Mutex::Locker locker(aio_lock);
+      if (aio_queue.empty()) {
+	if (aio_stop)
+	  break;
+	dout(20) << __func__ << " sleeping" << dendl;
+	write_finish_cond.Wait(aio_lock);
+	continue;
+      }
+    }
+
+    dout(20) << __func__ << " waiting for aio(s)" << dendl;
+    // wait for at least 1 and at most 16 completions, with no timeout
+    io_event event[16];
+    int r = io_getevents(aio_ctx, 1, 16, event, NULL);
+    if (r < 0) {
+      if (r == -EINTR) {
+	dout(0) << "io_getevents got " << cpp_strerror(r) << dendl;
+	continue;
+      }
+      derr << "io_getevents got " << cpp_strerror(r) << dendl;
+      if (r == -EIO) {
+	note_io_error_event(devname.c_str(), fn.c_str(), -EIO, 0, 0, 0);
+      }
+      ceph_abort_msg("got unexpected error from io_getevents");
+    }
+
+    {
+      Mutex::Locker locker(aio_lock);
+      for (int i=0; i<r; i++) {
+	// NOTE(review): this cast presumably relies on the iocb being the first
+	// member of aio_info -- confirm against the aio_info definition
+	aio_info *ai = (aio_info *)event[i].obj;
+	// the result must equal the full length of the aio
+	if (event[i].res != ai->len) {
+	  derr << "aio to " << ai->off << "~" << ai->len
+	       << " returned: " << (int)event[i].res << dendl;
+	  ceph_abort_msg("unexpected aio error");
+	}
+	dout(10) << __func__ << " aio " << ai->off
+		 << "~" << ai->len << " done" << dendl;
+	ai->done = true;
+      }
+      check_aio_completion();
+    }
+  }
+  dout(10) << __func__ << " exit" << dendl;
+#endif
+}
+
+#ifdef HAVE_LIBAIO
+/**
+ * Remove completed aios from the front of aio_queue and update state
+ * appropriately.
+ *
+ * Only a leading run of done entries is removed; the walk stops at the first
+ * entry that is not done.  If any removed entry carried a non-zero seq,
+ * journaled_seq is set to the last such seq and, unless the journal is in a
+ * full state or completions are plugged, completions are queued through it.
+ *
+ * Caller must hold aio_lock.
+ */
+void FileJournal::check_aio_completion()
+{
+  ceph_assert(aio_lock.is_locked());
+  dout(20) << "check_aio_completion" << dendl;
+
+  bool completed_something = false, signal = false;
+  uint64_t new_journaled_seq = 0;
+
+  list<aio_info>::iterator p = aio_queue.begin();
+  while (p != aio_queue.end() && p->done) {
+    dout(20) << "check_aio_completion completed seq " << p->seq << " "
+	     << p->off << "~" << p->len << dendl;
+    // seq 0 marks an aio that should not advance journaled state
+    if (p->seq) {
+      new_journaled_seq = p->seq;
+      completed_something = true;
+    }
+    aio_num--;
+    aio_bytes -= p->len;
+    aio_queue.erase(p++);
+    signal = true;
+  }
+
+  if (completed_something) {
+    // kick finisher?
+    //  only if we haven't filled up recently!
+    Mutex::Locker locker(finisher_lock);
+    journaled_seq = new_journaled_seq;
+    if (full_state != FULL_NOTFULL) {
+      dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq
+	       << ", full_commit_seq|full_restart_seq" << dendl;
+    } else {
+      if (plug_journal_completions) {
+	dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq
+		 << " due to completion plug" << dendl;
+      } else {
+	dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl;
+	queue_completions_thru(journaled_seq);
+      }
+    }
+  }
+  if (signal) {
+    // maybe write queue was waiting for aio count to drop?
+    aio_cond.Signal();
+  }
+}
+#endif
+
+/**
+ * Encode a set of transactions into a framed journal entry.
+ *
+ * The resulting entry is laid out as
+ *   header | pre_pad | payload | post_pad | footer
+ * where header and footer are identical copies of the entry_header_t and the
+ * total size is rounded up to header.alignment.
+ *
+ * @param tls transactions to encode
+ * @param tbl in: extra data appended after the encoded transactions (may be
+ *            empty); out: the complete framed entry
+ * @return length of the payload (h.len)
+ */
+int FileJournal::prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) {
+  dout(10) << "prepare_entry " << tls << dendl;
+  // only transactions with at least journal_align_min_size data bytes
+  // influence the alignment
+  int data_len = cct->_conf->journal_align_min_size - 1;
+  int data_align = -1; // -1 indicates that we don't care about the alignment
+  bufferlist bl;
+  for (vector<ObjectStore::Transaction>::iterator p = tls.begin();
+      p != tls.end(); ++p) {
+   // track the alignment of the transaction carrying the most data
+   if ((int)(*p).get_data_length() > data_len) {
+     data_len = (*p).get_data_length();
+     data_align = ((*p).get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK;
+    }
+    encode(*p, bl);
+  }
+  if (tbl->length()) {
+    bl.claim_append(*tbl);
+  }
+  // frame this entry
+  entry_header_t h;
+  unsigned head_size = sizeof(entry_header_t);
+  // header + footer + payload, before any padding
+  off64_t base_size = 2*head_size + bl.length();
+  memset(&h, 0, sizeof(h));
+  if (data_align >= 0)
+    h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK;
+  off64_t size = round_up_to(base_size + h.pre_pad, header.alignment);
+  unsigned post_pad = size - base_size - h.pre_pad;
+  h.len = bl.length();
+  h.post_pad = post_pad;
+  h.crc32c = bl.crc32c(0);
+  dout(10) << " len " << bl.length() << " -> " << size
+       << " (head " << head_size << " pre_pad " << h.pre_pad
+       << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")"
+       << " (bl alignment " << data_align << ")"
+       << dendl;
+  bufferlist ebl;
+  // header
+  ebl.append((const char*)&h, sizeof(h));
+  if (h.pre_pad) {
+    ebl.push_back(buffer::create_static(h.pre_pad, zero_buf));
+  }
+  // payload
+  ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+  if (h.post_pad) {
+    ebl.push_back(buffer::create_static(h.post_pad, zero_buf));
+  }
+  // footer
+  ebl.append((const char*)&h, sizeof(h));
+  if (directio)
+    ebl.rebuild_aligned(CEPH_DIRECTIO_ALIGNMENT);
+  tbl->claim(ebl);
+  return h.len;
+}
+
+/**
+ * Queue a prepared entry for the write thread.
+ *
+ * Registers the entry with the throttle, updates the perf counters, records
+ * a completion_item for oncommit and appends a write_item to writeq.
+ *
+ * @param seq       sequence number of the entry
+ * @param e         framed entry data; must be non-empty and smaller than
+ *                  header.max_size.  The write_item constructor claims it.
+ * @param orig_len  length accounted in the queue_bytes counter
+ * @param oncommit  context stored in the completion_item
+ * @param osd_op    optional tracked op used for event and trace annotations
+ */
+void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
+			       Context *oncommit, TrackedOpRef osd_op)
+{
+  // dump on queue
+  dout(5) << "submit_entry seq " << seq
+	  << " len " << e.length()
+	  << " (" << oncommit << ")" << dendl;
+  ceph_assert(e.length() > 0);
+  ceph_assert(e.length() < header.max_size);
+
+  if (logger) {
+    logger->inc(l_filestore_journal_queue_bytes, orig_len);
+    logger->inc(l_filestore_journal_queue_ops, 1);
+  }
+
+  throttle.register_throttle_seq(seq, e.length());
+  if (logger) {
+    logger->inc(l_filestore_journal_ops, 1);
+    logger->inc(l_filestore_journal_bytes, e.length());
+  }
+
+  if (osd_op) {
+    osd_op->mark_event("commit_queued_for_journal_write");
+    if (osd_op->store_trace) {
+      osd_op->journal_trace.init("journal", &trace_endpoint, &osd_op->store_trace);
+      osd_op->journal_trace.event("submit_entry");
+      osd_op->journal_trace.keyval("seq", seq);
+    }
+  }
+  {
+    // locks are taken in the order writeq_lock, aio_lock, completions_lock
+    Mutex::Locker l1(writeq_lock);
+#ifdef HAVE_LIBAIO
+    Mutex::Locker l2(aio_lock);
+#endif
+    Mutex::Locker l3(completions_lock);
+
+#ifdef HAVE_LIBAIO
+    aio_write_queue_ops++;
+    aio_write_queue_bytes += e.length();
+    // the write thread's aio throttle waits on aio_cond
+    aio_cond.Signal();
+#endif
+
+    completions.push_back(
+      completion_item(
+	seq, oncommit, ceph_clock_now(), osd_op));
+    // signal only on the empty -> non-empty transition
+    if (writeq.empty())
+      writeq_cond.Signal();
+    writeq.push_back(write_item(seq, e, orig_len, osd_op));
+    if (osd_op)
+      osd_op->journal_trace.keyval("queue depth", writeq.size());
+  }
+}
+
+/// Report whether writeq is empty, sampled while holding writeq_lock.
+bool FileJournal::writeq_empty()
+{
+  Mutex::Locker guard(writeq_lock);
+  const bool is_empty = writeq.empty();
+  return is_empty;
+}
+
+/**
+ * Return a reference to the item at the front of writeq.
+ *
+ * Requires write_lock to be held.  writeq_lock is only held while the
+ * reference is obtained, not while the caller uses it.
+ */
+FileJournal::write_item &FileJournal::peek_write()
+{
+  ceph_assert(write_lock.is_locked());
+  Mutex::Locker guard(writeq_lock);
+  write_item &head = writeq.front();
+  return head;
+}
+
+/**
+ * Drop the item at the front of writeq and, when a logger is present,
+ * decrement the queue counters for it.  Requires write_lock to be held.
+ */
+void FileJournal::pop_write()
+{
+  ceph_assert(write_lock.is_locked());
+  Mutex::Locker guard(writeq_lock);
+  if (logger) {
+    const write_item &head = writeq.front();
+    logger->dec(l_filestore_journal_queue_bytes, head.orig_len);
+    logger->dec(l_filestore_journal_queue_ops, 1);
+  }
+  writeq.pop_front();
+}
+
+/**
+ * Swap the contents of writeq with items and decrement the queue counters
+ * for every item now in items.  Requires write_lock to be held.
+ *
+ * @param items list exchanged with writeq
+ */
+void FileJournal::batch_pop_write(list<write_item> &items)
+{
+  ceph_assert(write_lock.is_locked());
+  {
+    Mutex::Locker guard(writeq_lock);
+    writeq.swap(items);
+  }
+
+  // counters are only maintained when a logger is attached
+  if (!logger)
+    return;
+  for (const auto &item : items) {
+    logger->dec(l_filestore_journal_queue_bytes, item.orig_len);
+    logger->dec(l_filestore_journal_queue_ops, 1);
+  }
+}
+
+/**
+ * Put items back at the front of writeq, re-incrementing the queue counters
+ * for each of them.  Requires write_lock to be held.
+ *
+ * @param items items to return to the queue; spliced out of this list
+ */
+void FileJournal::batch_unpop_write(list<write_item> &items)
+{
+  ceph_assert(write_lock.is_locked());
+
+  // counters are only maintained when a logger is attached
+  if (logger) {
+    for (const auto &item : items) {
+      logger->inc(l_filestore_journal_queue_bytes, item.orig_len);
+      logger->inc(l_filestore_journal_queue_ops, 1);
+    }
+  }
+
+  Mutex::Locker guard(writeq_lock);
+  writeq.splice(writeq.begin(), items);
+}
+
+/**
+ * Advance the journal's full-state machine at the start of a commit.
+ *
+ *  FULL_NOTFULL: nothing to do
+ *  FULL_FULL:    move to FULL_WAIT once seq has reached journaled_seq
+ *  FULL_WAIT:    move to FULL_NOTFULL and set the completion plug
+ *
+ * @param seq sequence number the commit is starting on
+ */
+void FileJournal::commit_start(uint64_t seq)
+{
+  dout(10) << "commit_start" << dendl;
+
+  // was full?
+  const auto state = full_state;
+
+  if (state == FULL_NOTFULL) {
+    return; // all good
+  }
+
+  if (state == FULL_FULL) {
+    if (seq < journaled_seq) {
+      dout(1) << "FULL_FULL commit_start on seq "
+              << seq << " < journaled_seq " << journaled_seq
+              << ", remaining in FULL_FULL"
+              << dendl;
+      return;
+    }
+    dout(1) << " FULL_FULL -> FULL_WAIT.  commit_start on seq "
+            << seq << " > journaled_seq " << journaled_seq
+            << ", moving to FULL_WAIT."
+            << dendl;
+    full_state = FULL_WAIT;
+    return;
+  }
+
+  if (state == FULL_WAIT) {
+    dout(1) << " FULL_WAIT -> FULL_NOTFULL.  journal now active, setting completion plug." << dendl;
+    full_state = FULL_NOTFULL;
+    plug_journal_completions = true;
+  }
+}
+
+/*
+ * Send a discard command to the journal block device.
+ *
+ * offset is rounded up to block_size; end is reduced by one block and then
+ * rounded up to block_size.  Nothing is discarded when the rounded offset is
+ * not below end.  A failed discard is only logged.
+ *
+ * @param offset start of the range to discard
+ * @param end    end of the range to discard
+ */
+void FileJournal::do_discard(int64_t offset, int64_t end)
+{
+  dout(10) << __func__ << " trim(" << offset << ", " << end << ")" << dendl;
+
+  offset = round_up_to(offset, block_size);
+  if (offset >= end)
+    return;
+  end = round_up_to(end - block_size, block_size);
+  ceph_assert(end >= offset);
+  if (offset < end) {
+    BlkDev blkdev(fd);
+    if (blkdev.discard(offset, end - offset) < 0) {
+      // capture errno right away so the logging machinery cannot clobber it
+      const int err = errno;
+      dout(1) << __func__ << " ioctl(BLKDISCARD) error: " << cpp_strerror(err) << dendl;
+    }
+  }
+}
+
+/**
+ * Note that everything up to and including seq has been committed.
+ *
+ * Releases throttle for seq, queues completions through seq, advances the
+ * journal start pointer (header.start / header.start_seq) past the committed
+ * entries, optionally discards the freed range, drops queued items whose seq
+ * is already committed, requests a header write and signals commit_cond.
+ *
+ * Asserts if seq is below last_committed_seq; returns early if seq equals it.
+ *
+ * @param seq highest committed sequence number
+ */
+void FileJournal::committed_thru(uint64_t seq)
+{
+  Mutex::Locker locker(write_lock);
+
+  auto released = throttle.flush(seq);
+  if (logger) {
+    logger->dec(l_filestore_journal_ops, released.first);
+    logger->dec(l_filestore_journal_bytes, released.second);
+  }
+
+  if (seq < last_committed_seq) {
+    dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl;
+    // going backwards is a caller bug: the assert below always fires here
+    ceph_assert(seq >= last_committed_seq);
+    return;
+  }
+  if (seq == last_committed_seq) {
+    dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl;
+    return;
+  }
+
+  dout(5) << "committed_thru " << seq << " (last_committed_seq " << last_committed_seq << ")" << dendl;
+  last_committed_seq = seq;
+
+  // completions!
+  {
+    // note: this locker shadows the write_lock locker above within this scope
+    Mutex::Locker locker(finisher_lock);
+    queue_completions_thru(seq);
+    if (plug_journal_completions && seq >= header.start_seq) {
+      dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
+      plug_journal_completions = false;
+      queue_completions_thru(journaled_seq);
+    }
+  }
+
+  // adjust start pointer
+  while (!journalq.empty() && journalq.front().first <= seq) {
+    journalq.pop_front();
+  }
+
+  int64_t old_start = header.start;
+  if (!journalq.empty()) {
+    // oldest uncommitted entry becomes the new start
+    header.start = journalq.front().second;
+    header.start_seq = journalq.front().first;
+  } else {
+    // everything journaled is committed: start where the next write goes
+    header.start = write_pos;
+    header.start_seq = seq + 1;
+  }
+
+  if (discard) {
+    dout(10) << __func__  << " will trim (" << old_start << ", " << header.start << ")" << dendl;
+    if (old_start < header.start)
+      do_discard(old_start, header.start - 1);
+    else {
+      // the freed range wraps around the end of the journal
+      do_discard(old_start, header.max_size - 1);
+      do_discard(get_top(), header.start - 1);
+    }
+  }
+
+  must_write_header = true;
+  print_header(header);
+
+  // committed but unjournaled items
+  while (!writeq_empty() && peek_write().seq <= seq) {
+    dout(15) << " dropping committed but unwritten seq " << peek_write().seq
+	     << " len " << peek_write().bl.length()
+	     << dendl;
+    complete_write(1, peek_write().orig_len);
+    pop_write();
+  }
+
+  // the write thread waits on commit_cond when the journal is full
+  commit_cond.Signal();
+
+  dout(10) << "committed_thru done" << dendl;
+}
+
+
+/**
+ * Log that a batch of writes has finished.
+ *
+ * @param ops   number of ops in the batch
+ * @param bytes number of bytes in the batch
+ */
+void FileJournal::complete_write(uint64_t ops, uint64_t bytes)
+{
+  dout(5) << __func__
+          << " finished " << ops << " ops and " << bytes << " bytes"
+          << dendl;
+}
+
+/**
+ * Switch the journal into write mode.
+ *
+ * Applies the throttle parameters, calls _open(true), positions write_pos at
+ * read_pos (or at get_top() when read_pos is not positive), resets read_pos,
+ * requests a header write and starts the writer.
+ *
+ * @return 0 on success, or the negative value returned by
+ *         set_throttle_params() or _open()
+ */
+int FileJournal::make_writeable()
+{
+  dout(10) << __func__ << dendl;
+
+  if (int r = set_throttle_params(); r < 0)
+    return r;
+
+  if (int r = _open(true); r < 0)
+    return r;
+
+  if (read_pos > 0) {
+    write_pos = read_pos;
+  } else {
+    write_pos = get_top();
+  }
+  read_pos = 0;
+
+  must_write_header = true;
+
+  start_writer();
+  return 0;
+}
+
+/**
+ * Push the journal throttle settings from the configuration into throttle.
+ *
+ * @return 0 if the parameters were accepted, -EINVAL otherwise (the reason
+ *         reported by set_params() is logged)
+ */
+int FileJournal::set_throttle_params()
+{
+  stringstream reason;
+  const auto &conf = cct->_conf;
+  const bool accepted = throttle.set_params(
+    conf->journal_throttle_low_threshhold,
+    conf->journal_throttle_high_threshhold,
+    conf->filestore_expected_throughput_bytes,
+    conf->journal_throttle_high_multiple,
+    conf->journal_throttle_max_multiple,
+    header.max_size - get_top(),
+    &reason);
+
+  if (accepted)
+    return 0;
+
+  derr << "tried to set invalid params: "
+       << reason.str()
+       << dendl;
+  return -EINVAL;
+}
+
+/**
+ * Return the null-terminated list of configuration keys listed below.
+ */
+const char** FileJournal::get_tracked_conf_keys() const
+{
+  static const char *tracked_keys[] = {
+    "journal_throttle_low_threshhold",
+    "journal_throttle_high_threshhold",
+    "journal_throttle_high_multiple",
+    "journal_throttle_max_multiple",
+    "filestore_expected_throughput_bytes",
+    nullptr
+  };
+  return tracked_keys;
+}
+
+/**
+ * Read olen bytes starting at pos, wrapping from header.max_size back to
+ * get_top() as needed, and append them to bl.
+ *
+ * A failed seek trips an assert; a failed read aborts.
+ *
+ * @param pos     position to start reading at
+ * @param olen    number of bytes to read
+ * @param bl      list the data is appended to, one buffer per contiguous read
+ * @param out_pos if non-null, receives the (wrapped) position after the read
+ */
+void FileJournal::wrap_read_bl(
+  off64_t pos,
+  int64_t olen,
+  bufferlist* bl,
+  off64_t *out_pos
+  ) const
+{
+  while (olen > 0) {
+    // fold a position at or past the end back to just after the header area
+    while (pos >= header.max_size)
+      pos = pos + get_top() - header.max_size;
+
+    // this pass reads up to the end of the journal ("partial") or all that
+    // remains ("rest")
+    int64_t len = olen;
+    if (pos + olen > header.max_size)
+      len = header.max_size - pos;
+
+    const int64_t landed = ::lseek64(fd, pos, SEEK_SET);
+    ceph_assert(landed == pos);
+
+    bufferptr chunk = buffer::create(len);
+    const int rc = safe_read_exact(fd, chunk.c_str(), len);
+    if (rc) {
+      derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << len << " returned "
+           << cpp_strerror(rc) << dendl;
+      ceph_abort();
+    }
+    bl->push_back(std::move(chunk));
+
+    olen -= len;
+    pos += len;
+  }
+
+  if (pos >= header.max_size)
+    pos = pos + get_top() - header.max_size;
+  if (out_pos)
+    *out_pos = pos;
+}
+
+/**
+ * Read the journal entry at read_pos.
+ *
+ * On a successful read the entry is recorded in journalq and its size is
+ * taken from the throttle, whether or not the entry is then accepted.
+ *
+ * @param bl       receives the entry payload
+ * @param next_seq in: compared against the seq of the entry read (an entry
+ *                 with a smaller seq is rejected); out: set to that seq when
+ *                 the entry is accepted
+ * @param corrupt  if non-null, set to true when corruption is detected and
+ *                 journal_ignore_corruption is enabled; false otherwise
+ * @return true if an entry was read and accepted, false otherwise
+ */
+bool FileJournal::read_entry(
+  bufferlist &bl,
+  uint64_t &next_seq,
+  bool *corrupt)
+{
+  if (corrupt)
+    *corrupt = false;
+  uint64_t seq = next_seq;
+
+  // read_pos == 0 means the journal is not open for reading
+  if (!read_pos) {
+    dout(2) << "read_entry -- not readable" << dendl;
+    return false;
+  }
+
+  off64_t pos = read_pos;
+  off64_t next_pos = pos;
+  stringstream ss;
+  read_entry_result result = do_read_entry(
+    pos,
+    &next_pos,
+    &bl,
+    &seq,
+    &ss);
+  if (result == SUCCESS) {
+    journalq.push_back( pair<uint64_t,off64_t>(seq, pos));
+    // bytes occupied by this entry, accounting for wrap-around
+    uint64_t amount_to_take =
+      next_pos > pos ?
+      next_pos - pos :
+      (header.max_size - pos) + (next_pos - get_top());
+    throttle.take(amount_to_take);
+    throttle.register_throttle_seq(next_seq, amount_to_take);
+    if (logger) {
+      logger->inc(l_filestore_journal_ops, 1);
+      logger->inc(l_filestore_journal_bytes, amount_to_take);
+    }
+    // the entry's seq is below what the caller asked for: stop here
+    if (next_seq > seq) {
+      return false;
+    } else {
+      read_pos = next_pos;
+      next_seq = seq;
+      if (seq > journaled_seq)
+        journaled_seq = seq;
+      return true;
+    }
+  } else {
+    derr << "do_read_entry(" << pos << "): " << ss.str() << dendl;
+  }
+
+  // failing before the header's committed_up_to means entries known to have
+  // been committed are unreadable
+  if (seq && seq < header.committed_up_to) {
+    derr << "Unable to read past sequence " << seq
+	 << " but header indicates the journal has committed up through "
+	 << header.committed_up_to << ", journal is corrupt" << dendl;
+    if (cct->_conf->journal_ignore_corruption) {
+      if (corrupt)
+	*corrupt = true;
+      return false;
+    } else {
+      ceph_abort();
+    }
+  }
+
+  dout(2) << "No further valid entries found, journal is most likely valid"
+	  << dendl;
+  return false;
+}
+
+/**
+ * Read and validate one journal entry at init_pos.
+ *
+ * The entry is validated by checking the header magic, comparing the footer
+ * byte-for-byte against the header, and (when the journal has FLAG_CRC set
+ * or the entry carries a non-zero crc) verifying the payload crc32c.
+ *
+ * @param init_pos position of the entry header
+ * @param next_pos if non-null, receives the position following the entry; on
+ *                 a bad header magic it is set to init_pos + 4k instead
+ * @param bl       if non-null, receives the payload (cleared first)
+ * @param seq      if non-null, receives the entry's seq on success
+ * @param ss       if non-null, receives a description of any failure
+ * @param _h       if non-null, receives a copy of the header on success
+ * @return SUCCESS, or MAYBE_CORRUPT if any validation step fails
+ */
+FileJournal::read_entry_result FileJournal::do_read_entry(
+  off64_t init_pos,
+  off64_t *next_pos,
+  bufferlist *bl,
+  uint64_t *seq,
+  ostream *ss,
+  entry_header_t *_h) const
+{
+  off64_t cur_pos = init_pos;
+  // scratch payload buffer used when the caller does not supply one
+  bufferlist _bl;
+  if (!bl)
+    bl = &_bl;
+
+  // header
+  entry_header_t *h;
+  bufferlist hbl;
+  off64_t _next_pos;
+  wrap_read_bl(cur_pos, sizeof(*h), &hbl, &_next_pos);
+  // h points into hbl, which stays alive for the rest of this function
+  h = reinterpret_cast<entry_header_t *>(hbl.c_str());
+
+  if (!h->check_magic(cur_pos, header.get_fsid64())) {
+    dout(25) << "read_entry " << init_pos
+	     << " : bad header magic, end of journal" << dendl;
+    if (ss)
+      *ss << "bad header magic";
+    if (next_pos)
+      *next_pos = init_pos + (4<<10); // check 4k ahead
+    return MAYBE_CORRUPT;
+  }
+  cur_pos = _next_pos;
+
+  // pad + body + pad
+  if (h->pre_pad)
+    cur_pos += h->pre_pad;
+
+  bl->clear();
+  wrap_read_bl(cur_pos, h->len, bl, &cur_pos);
+
+  if (h->post_pad)
+    cur_pos += h->post_pad;
+
+  // footer
+  entry_header_t *f;
+  bufferlist fbl;
+  wrap_read_bl(cur_pos, sizeof(*f), &fbl, &cur_pos);
+  f = reinterpret_cast<entry_header_t *>(fbl.c_str());
+  // the footer must be an exact copy of the header
+  if (memcmp(f, h, sizeof(*f))) {
+    if (ss)
+      *ss << "bad footer magic, partial entry";
+    if (next_pos)
+      *next_pos = cur_pos;
+    return MAYBE_CORRUPT;
+  }
+
+  if ((header.flags & header_t::FLAG_CRC) ||   // if explicitly enabled (new journal)
+      h->crc32c != 0) {                        // newer entry in old journal
+    uint32_t actual_crc = bl->crc32c(0);
+    if (actual_crc != h->crc32c) {
+      if (ss)
+	*ss << "header crc (" << h->crc32c
+	    << ") doesn't match body crc (" << actual_crc << ")";
+      if (next_pos)
+	*next_pos = cur_pos;
+      return MAYBE_CORRUPT;
+    }
+  }
+
+  // yay!
+  dout(2) << "read_entry " << init_pos << " : seq " << h->seq
+	  << " " << h->len << " bytes"
+	  << dendl;
+
+  // ok!
+  if (seq)
+    *seq = h->seq;
+
+
+  if (next_pos)
+    *next_pos = cur_pos;
+
+  if (_h)
+    *_h = *h;
+
+  ceph_assert(cur_pos % header.alignment == 0);
+  return SUCCESS;
+}
+
+/**
+ * Acquire count units from the journal throttle by calling throttle.get().
+ *
+ * @param count amount to request from the throttle
+ */
+void FileJournal::reserve_throttle_and_backoff(uint64_t count)
+{
+  throttle.get(count);
+}
+
+/**
+ * Scan entries from header.start until one with seq == wanted_seq is found.
+ *
+ * Aborts if an entry cannot be read before the wanted one is found.
+ *
+ * @param wanted_seq sequence number to look for
+ * @param _pos       if non-null, receives the position of the matching entry
+ * @param h          passed to do_read_entry() to receive each entry's header;
+ *                   holds the matching entry's header on return
+ */
+void FileJournal::get_header(
+  uint64_t wanted_seq,
+  off64_t *_pos,
+  entry_header_t *h)
+{
+  off64_t entry_pos = header.start;
+  off64_t following_pos = entry_pos;
+  bufferlist payload;
+  uint64_t found_seq = 0;
+  dout(2) << __func__ << dendl;
+  for (;;) {
+    payload.clear();
+    entry_pos = following_pos;
+    const read_entry_result outcome =
+      do_read_entry(entry_pos, &following_pos, &payload, &found_seq, 0, h);
+    if (outcome == FAILURE || outcome == MAYBE_CORRUPT)
+      ceph_abort();
+    if (found_seq != wanted_seq)
+      continue;
+    if (_pos)
+      *_pos = entry_pos;
+    return;
+  }
+  ceph_abort(); // not reachable
+}
+
+/**
+ * Corrupt a single byte of the journal by incrementing it.
+ *
+ * The byte is read through the journal's own fd and the modified value is
+ * written back through wfd at the same offset.
+ *
+ * @param wfd        descriptor used for the write
+ * @param corrupt_at offset of the byte; an offset at or past header.max_size
+ *                   is wrapped once
+ */
+void FileJournal::corrupt(
+  int wfd,
+  off64_t corrupt_at)
+{
+  dout(2) << __func__ << dendl;
+  if (corrupt_at >= header.max_size)
+    corrupt_at = corrupt_at + get_top() - header.max_size;
+
+  // fetch the current value
+  int64_t where = ::lseek64(fd, corrupt_at, SEEK_SET);
+  ceph_assert(where == corrupt_at);
+
+  char victim;
+  int rc = safe_read_exact(fd, &victim, 1);
+  ceph_assert(rc == 0);
+
+  // write back the altered value
+  where = ::lseek64(wfd, corrupt_at, SEEK_SET);
+  ceph_assert(where == corrupt_at);
+
+  victim++;
+  rc = safe_write(wfd, &victim, 1);
+  ceph_assert(rc == 0);
+}
+
+/**
+ * Corrupt the first payload byte of the entry with the given seq.
+ *
+ * @param wfd descriptor passed to corrupt() for the write
+ * @param seq sequence number of the entry to damage
+ */
+void FileJournal::corrupt_payload(
+  int wfd,
+  uint64_t seq)
+{
+  dout(2) << __func__ << dendl;
+  off64_t entry_pos = 0;
+  entry_header_t hdr;
+  get_header(seq, &entry_pos, &hdr);
+
+  // the payload follows the entry header and its pre-padding
+  off64_t target = entry_pos;
+  target += sizeof(entry_header_t);
+  target += hdr.pre_pad;
+  corrupt(wfd, target);
+}
+
+
+/**
+ * Corrupt the first byte of the magic2 field in the footer of the entry
+ * with the given seq.
+ *
+ * @param wfd descriptor passed to corrupt() for the write
+ * @param seq sequence number of the entry to damage
+ */
+void FileJournal::corrupt_footer_magic(
+  int wfd,
+  uint64_t seq)
+{
+  dout(2) << __func__ << dendl;
+  off64_t entry_pos = 0;
+  entry_header_t hdr;
+  get_header(seq, &entry_pos, &hdr);
+
+  // byte offset of magic2 within an entry_header_t
+  const auto magic2_off =
+    reinterpret_cast<char*>(&hdr.magic2) - reinterpret_cast<char*>(&hdr);
+
+  // skip header, pre-pad, payload and post-pad to reach the footer
+  off64_t target = entry_pos;
+  target += sizeof(entry_header_t);
+  target += hdr.pre_pad;
+  target += hdr.len;
+  target += hdr.post_pad;
+  target += magic2_off;
+  corrupt(wfd, target);
+}
+
+
+/**
+ * Corrupt the first byte of the magic2 field in the header of the entry
+ * with the given seq.
+ *
+ * @param wfd descriptor passed to corrupt() for the write
+ * @param seq sequence number of the entry to damage
+ */
+void FileJournal::corrupt_header_magic(
+  int wfd,
+  uint64_t seq)
+{
+  dout(2) << __func__ << dendl;
+  off64_t entry_pos = 0;
+  entry_header_t hdr;
+  get_header(seq, &entry_pos, &hdr);
+
+  // byte offset of magic2 within an entry_header_t
+  const auto magic2_off =
+    reinterpret_cast<char*>(&hdr.magic2) - reinterpret_cast<char*>(&hdr);
+
+  off64_t target = entry_pos;
+  target += magic2_off;
+  corrupt(wfd, target);
+}
+
+/**
+ * Estimate the number of bytes between header.start and write_pos, taking
+ * wrap-around into account when write_pos is below the start.
+ *
+ * NOTE(review): the wrapped case uses max_size rather than header.max_size
+ * and does not subtract get_top(), unlike the wrap arithmetic in
+ * read_entry() -- presumably acceptable for an estimate; confirm.
+ *
+ * @return the estimated size in bytes
+ */
+off64_t FileJournal::get_journal_size_estimate()
+{
+  const off64_t start = header.start;
+  off64_t size = 0;
+  if (write_pos >= start) {
+    size = write_pos - start;
+  } else {
+    size = (max_size - start) + write_pos;
+  }
+  dout(20) << __func__ << " journal size=" << size << dendl;
+  return size;
+}
+
+/**
+ * Add the raw devices backing the journal to ls.
+ *
+ * Does nothing when the whole-disk device cannot be determined.
+ *
+ * @param ls set passed on to get_raw_devices()
+ */
+void FileJournal::get_devices(set<string> *ls)
+{
+  string whole_disk;
+  BlkDev blkdev(fd);
+  const int rc = blkdev.wholedisk(&whole_disk);
+  if (rc != 0)
+    return;
+  get_raw_devices(whole_disk, ls);
+}
+
+/**
+ * Record the journal's partition path and device node in pm.
+ *
+ * Each value falls back to "unknown" when the lookup fails.  A successful
+ * device-node lookup also updates devname.
+ *
+ * @param pm metadata map to fill in
+ */
+void FileJournal::collect_metadata(map<string,string> *pm)
+{
+  BlkDev blkdev(fd);
+  char partition_path[PATH_MAX];
+  char dev_node[PATH_MAX];
+
+  const bool have_partition = (blkdev.partition(partition_path, PATH_MAX) == 0);
+  if (have_partition) {
+    (*pm)["backend_filestore_journal_partition_path"] = string(partition_path);
+  } else {
+    (*pm)["backend_filestore_journal_partition_path"] = "unknown";
+  }
+
+  const bool have_disk = (blkdev.wholedisk(dev_node, PATH_MAX) == 0);
+  if (have_disk) {
+    (*pm)["backend_filestore_journal_dev_node"] = string(dev_node);
+    devname = dev_node;
+  } else {
+    (*pm)["backend_filestore_journal_dev_node"] = "unknown";
+  }
+}
diff --git a/src/os/filestore/FileJournal.h b/src/os/filestore/FileJournal.h
new file mode 100644
index 00000000..2313b4b8
--- /dev/null
+++ b/src/os/filestore/FileJournal.h
@@ -0,0 +1,556 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILEJOURNAL_H
+#define CEPH_FILEJOURNAL_H
+
+#include <stdlib.h>
+#include <deque>
+using std::deque;
+
+#include "Journal.h"
+#include "common/config_fwd.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+#include "JournalThrottle.h"
+#include "common/zipkin_trace.h"
+
+#ifdef HAVE_LIBAIO
+# include <libaio.h>
+#endif
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+/**
+ * Implements journaling on top of block device or file.
+ *
+ * Lock ordering is write_lock > aio_lock > (completions_lock | finisher_lock)
+ */
+class FileJournal :
+  public Journal,
+  public md_config_obs_t {
+public:
+  /// Protected by finisher_lock
+  struct completion_item {
+    uint64_t seq;             ///< sequence number this completion belongs to
+    Context *finish;          ///< callback stored with the item (may be null)
+    utime_t start;            ///< timestamp supplied at construction
+    TrackedOpRef tracked_op;  ///< op reference supplied at construction
+
+    /// Empty item: seq 0, no callback, zero start time.
+    completion_item()
+      : seq(0), finish(nullptr), start(0) {}
+
+    completion_item(uint64_t op_seq, Context *on_finish, utime_t started,
+                    TrackedOpRef op)
+      : seq(op_seq), finish(on_finish), start(started), tracked_op(op) {}
+  };
+  /// One queued journal write: the payload plus bookkeeping for it.
+  struct write_item {
+    uint64_t seq;             ///< sequence number of the entry
+    bufferlist bl;            ///< payload, claimed from the caller's bufferlist
+    uint32_t orig_len;        ///< length value passed in by the submitter
+    TrackedOpRef tracked_op;
+    ZTracer::Trace trace;
+    // NOTE(review): `ol` is a signed int but is stored in the uint32_t
+    // orig_len; callers presumably never pass a negative value -- confirm.
+    write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) :
+      seq(s), orig_len(ol), tracked_op(opref) {
+      bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+    }
+    /// Empty item: seq 0, orig_len 0, empty payload.
+    write_item() : seq(0), orig_len(0) {}
+  };
+
+  Mutex finisher_lock;
+  Cond finisher_cond;
+  uint64_t journaled_seq;
+  bool plug_journal_completions;
+
+  Mutex writeq_lock;
+  Cond writeq_cond;
+  list<write_item> writeq;
+  bool writeq_empty();
+  write_item &peek_write();
+  void pop_write();
+  void batch_pop_write(list<write_item> &items);
+  void batch_unpop_write(list<write_item> &items);
+
+  Mutex completions_lock;
+  list<completion_item> completions;
+  /// @return true when the completions list is empty (takes completions_lock)
+  bool completions_empty() {
+    Mutex::Locker guard(completions_lock);
+    const bool none_queued = completions.empty();
+    return none_queued;
+  }
+  /// Exchange the contents of `items` with the completions list, under
+  /// completions_lock.
+  void batch_pop_completions(list<completion_item> &items) {
+    Mutex::Locker guard(completions_lock);
+    items.swap(completions);
+  }
+  /// Move every element of `items` back to the front of the completions
+  /// list, under completions_lock.  `items` is left empty.
+  void batch_unpop_completions(list<completion_item> &items) {
+    Mutex::Locker guard(completions_lock);
+    list<completion_item>::iterator head = completions.begin();
+    completions.splice(head, items);
+  }
+  /// @return a copy of the first completion item; asserts the list is
+  /// non-empty.  Takes completions_lock.
+  completion_item completion_peek_front() {
+    Mutex::Locker guard(completions_lock);
+    ceph_assert(!completions.empty());
+    completion_item head = completions.front();
+    return head;
+  }
+  /// Drop the first completion item; asserts the list is non-empty.
+  /// Takes completions_lock.
+  void completion_pop_front() {
+    Mutex::Locker guard(completions_lock);
+    ceph_assert(!completions.empty());
+    completions.erase(completions.begin());
+  }
+
+  int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) override;
+
+  void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len,
+		    Context *oncommit,
+		    TrackedOpRef osd_op = TrackedOpRef()) override;
+  /// End protected by finisher_lock
+
+  /*
+   * journal header
+   */
+  // Journal header record plus its versioned encode/decode.  encode() always
+  // writes version 4; decode() accepts the legacy (v < 2) flat layout as well
+  // as versions 2, 3 and 4 of the enveloped layout.
+  struct header_t {
+    enum {
+      FLAG_CRC = (1<<0),
+      // NOTE: remove kludgey weirdness in read_header() next time a flag is added.
+    };
+
+    uint64_t flags;
+    uuid_d fsid;
+    __u32 block_size;
+    __u32 alignment;
+    int64_t max_size;   // max size of journal ring buffer
+    int64_t start;      // offset of first entry
+    uint64_t committed_up_to; // committed up to
+
+    /**
+     * start_seq
+     *
+     * entry at header.start has sequence >= start_seq
+     *
+     * Generally, the entry at header.start will have sequence
+     * start_seq if it exists.  The only exception is immediately
+     * after journal creation since the first sequence number is
+     * not known.
+     *
+     * If the first read on open fails, we can assume corruption
+     * if start_seq > committed_up_to because the entry would have
+     * a sequence >= start_seq and therefore > committed_up_to.
+     */
+    uint64_t start_seq;
+
+    // fsid is left to uuid_d's own default construction.
+    header_t() :
+      flags(0), block_size(0), alignment(0), max_size(0), start(0),
+      committed_up_to(0), start_seq(0) {}
+
+    // Reset `start` to block_size; no other field is touched.
+    void clear() {
+      start = block_size;
+    }
+
+    // Reinterpret the leading 8 bytes of fsid as a uint64_t.
+    // NOTE(review): type-punning cast; relies on fsid.bytes() being suitably
+    // aligned and on the compiler tolerating the aliasing -- confirm.
+    uint64_t get_fsid64() const {
+      return *(uint64_t*)fsid.bytes();
+    }
+
+    // Layout written: __u32 version (4), then a nested bufferlist holding
+    // all fields in declaration order.
+    void encode(bufferlist& bl) const {
+      using ceph::encode;
+      __u32 v = 4;
+      encode(v, bl);
+      bufferlist em;
+      {
+	encode(flags, em);
+	encode(fsid, em);
+	encode(block_size, em);
+	encode(alignment, em);
+	encode(max_size, em);
+	encode(start, em);
+	encode(committed_up_to, em);
+	encode(start_seq, em);
+      }
+      encode(em, bl);
+    }
+    // Fields absent from an older encoding are set to 0.
+    void decode(bufferlist::const_iterator& bl) {
+      using ceph::decode;
+      __u32 v;
+      decode(v, bl);
+      if (v < 2) {  // normally 0, but conceivably 1
+	// decode old header_t struct (pre v0.40).
+	bl.advance(4u); // skip __u32 flags (it was unused by any old code)
+	flags = 0;
+	uint64_t tfsid;
+	decode(tfsid, bl);
+	// the old 64-bit fsid is duplicated into both halves of the 16-byte uuid
+	*(uint64_t*)&fsid.bytes()[0] = tfsid;
+	*(uint64_t*)&fsid.bytes()[8] = tfsid;
+	decode(block_size, bl);
+	decode(alignment, bl);
+	decode(max_size, bl);
+	decode(start, bl);
+	committed_up_to = 0;
+	start_seq = 0;
+	return;
+      }
+      // v >= 2: the fields live inside a nested bufferlist
+      bufferlist em;
+      decode(em, bl);
+      auto t = em.cbegin();
+      decode(flags, t);
+      decode(fsid, t);
+      decode(block_size, t);
+      decode(alignment, t);
+      decode(max_size, t);
+      decode(start, t);
+
+      // committed_up_to is only present from version 3 on
+      if (v > 2)
+	decode(committed_up_to, t);
+      else
+	committed_up_to = 0;
+
+      // start_seq is only present from version 4 on
+      if (v > 3)
+	decode(start_seq, t);
+      else
+	start_seq = 0;
+    }
+  } header;
+
+  /**
+   * Per-entry header record (packed, 4-byte aligned).
+   *
+   * magic1 is compared against the entry's position and magic2 against
+   * make_magic(seq, len, fsid); see check_magic().
+   */
+  struct entry_header_t {
+    uint64_t seq;     // fs op seq #
+    uint32_t crc32c;  // payload only.  not header, pre_pad, post_pad, or footer.
+    uint32_t len;
+    uint32_t pre_pad, post_pad;
+    uint64_t magic1;
+    uint64_t magic2;
+
+    /// Value expected in magic2 for an entry with the given seq/len/fsid.
+    static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) {
+      return (fsid ^ seq ^ len);
+    }
+    /// @return true when magic1 equals `pos` and magic2 equals
+    /// make_magic(seq, len, fsid) for this header's own seq and len.
+    bool check_magic(off64_t pos, uint64_t fsid) const {
+      return
+	magic1 == static_cast<uint64_t>(pos) &&
+	magic2 == make_magic(seq, len, fsid);
+    }
+  } __attribute__((__packed__, aligned(4)));
+
+  /// @return true when journalq holds no (seq, offset) pairs; takes no lock
+  bool journalq_empty() { return journalq.empty(); }
+
+private:
+  string fn;
+
+  char *zero_buf;
+  off64_t max_size;
+  size_t block_size;
+  bool directio, aio, force_aio;
+  bool must_write_header;
+  off64_t write_pos;      // byte where the next entry to be written will go
+  off64_t read_pos;       //
+  bool discard;	  //for block journal whether support discard
+
+#ifdef HAVE_LIBAIO
+  /// state associated with an in-flight aio request
+  /// Protected by aio_lock
+  // NOTE(review): the destructor delete[]s `iov`, but the copy operations are
+  // the implicit ones; copying an aio_info whose iov is non-null would free
+  // the array twice.  Confirm instances are only copied while iov is NULL.
+  struct aio_info {
+    struct iocb iocb {};
+    bufferlist bl;        ///< data for this request, claimed from the caller
+    struct iovec *iov;    ///< array released with delete[] in the destructor
+    bool done;
+    uint64_t off, len;    ///< these are for debug only
+    uint64_t seq;         ///< seq number to complete on aio completion, if non-zero
+
+    // len is captured from b before bl.claim(b) takes b's contents.
+    aio_info(bufferlist& b, uint64_t o, uint64_t s)
+      : iov(NULL), done(false), off(o), len(b.length()), seq(s) {
+      bl.claim(b);
+    }
+    ~aio_info() {
+      delete[] iov;
+    }
+  };
+  Mutex aio_lock;
+  Cond aio_cond;
+  Cond write_finish_cond;
+  io_context_t aio_ctx;
+  list<aio_info> aio_queue;
+  int aio_num, aio_bytes;
+  uint64_t aio_write_queue_ops;
+  uint64_t aio_write_queue_bytes;
+  /// End protected by aio_lock
+#endif
+
+  uint64_t last_committed_seq;
+  uint64_t journaled_since_start;
+
+  string devname;
+
+  /*
+   * full states cycle at the beginning of each commit epoch, when commit_start()
+   * is called.
+   *   FULL - we just filled up during this epoch.
+   *   WAIT - we filled up last epoch; now we have to wait until everything during
+   *          that epoch commits to the fs before we can start writing over it.
+   *   NOTFULL - all good, journal away.
+   */
+  enum {
+    FULL_NOTFULL = 0,
+    FULL_FULL = 1,
+    FULL_WAIT = 2,
+  } full_state;
+
+  int fd;
+
+  // in journal
+  deque<pair<uint64_t, off64_t> > journalq;  // track seq offsets, so we can trim later.
+  uint64_t writing_seq;
+
+
+  // throttle
+  int set_throttle_params();
+  const char** get_tracked_conf_keys() const override;
+  /// Config observer hook: refresh the throttle parameters once if any of
+  /// the tracked keys appears in `changed`.
+  void handle_conf_change(
+    const ConfigProxy& conf,
+    const std::set <std::string> &changed) override {
+    // get_tracked_conf_keys() yields a null-terminated array of key names
+    const char **key = get_tracked_conf_keys();
+    while (*key) {
+      if (changed.count(string(*key)) > 0) {
+	set_throttle_params();
+	break;
+      }
+      ++key;
+    }
+  }
+
+  void complete_write(uint64_t ops, uint64_t bytes);
+  JournalThrottle throttle;
+
+  // write thread
+  Mutex write_lock;
+  bool write_stop;
+  bool aio_stop;
+
+  Cond commit_cond;
+
+  int _open(bool wr, bool create=false);
+  int _open_block_device();
+  void _close(int fd) const;
+  int _open_file(int64_t oldsize, blksize_t blksize, bool create);
+  int _dump(ostream& out, bool simple);
+  void print_header(const header_t &hdr) const;
+  int read_header(header_t *hdr) const;
+  bufferptr prepare_header();
+  void start_writer();
+  void stop_writer();
+  void write_thread_entry();
+
+  void queue_completions_thru(uint64_t seq);
+
+  int check_for_full(uint64_t seq, off64_t pos, off64_t size);
+  int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytee);
+  int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos,
+    uint64_t& orig_ops, uint64_t& orig_bytes);
+  void do_write(bufferlist& bl);
+
+  void write_finish_thread_entry();
+  void check_aio_completion();
+  void do_aio_write(bufferlist& bl);
+  int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq);
+
+
+  void check_align(off64_t pos, bufferlist& bl);
+  int write_bl(off64_t& pos, bufferlist& bl);
+
+  /// read len from journal starting at in_pos and wrapping up to len
+  void wrap_read_bl(
+    off64_t in_pos,   ///< [in] start position
+    int64_t len,      ///< [in] length to read
+    bufferlist* bl,   ///< [out] result
+    off64_t *out_pos  ///< [out] next position to read, will be wrapped
+    ) const;
+
+  void do_discard(int64_t offset, int64_t end);
+
+  /// Thread whose body is FileJournal::write_thread_entry().
+  class Writer : public Thread {
+    FileJournal *journal;
+  public:
+    explicit Writer(FileJournal *fj)
+      : journal(fj)
+    {}
+
+    void *entry() override {
+      journal->write_thread_entry();
+      return nullptr;
+    }
+  } write_thread;
+
+  /// Thread whose body is FileJournal::write_finish_thread_entry().
+  class WriteFinisher : public Thread {
+    FileJournal *journal;
+  public:
+    explicit WriteFinisher(FileJournal *fj)
+      : journal(fj)
+    {}
+
+    void *entry() override {
+      journal->write_finish_thread_entry();
+      return nullptr;
+    }
+  } write_finish_thread;
+
+  /// @return sizeof(header) rounded up with round_up_to() to block_size.
+  /// NOTE(review): sizeof(header) is the in-memory size of header_t, not its
+  /// encoded length -- presumably the first position usable for entries.
+  off64_t get_top() const {
+    return round_up_to(sizeof(header), block_size);
+  }
+
+  ZTracer::Endpoint trace_endpoint;
+
+ public:
+  /**
+   * Construct a journal backed by path `f`; no file is opened here (fd
+   * starts at -1).
+   *
+   * @param dio   request direct I/O
+   * @param ai    request aio; cleared below when `dio` is false, or (builds
+   *              without HAVE_LIBAIO) when CEPH_DEV is unset
+   * @param faio  stored in force_aio
+   *
+   * Registers this object as a config observer on cct->_conf.
+   */
+  FileJournal(CephContext* cct, uuid_d fsid, Finisher *fin, Cond *sync_cond,
+	      const char *f, bool dio=false, bool ai=true, bool faio=false) :
+    Journal(cct, fsid, fin, sync_cond),
+    finisher_lock("FileJournal::finisher_lock", false, true, false),
+    journaled_seq(0),
+    plug_journal_completions(false),
+    writeq_lock("FileJournal::writeq_lock", false, true, false),
+    completions_lock(
+      "FileJournal::completions_lock", false, true, false),
+    fn(f),
+    zero_buf(NULL),
+    max_size(0), block_size(0),
+    directio(dio), aio(ai), force_aio(faio),
+    must_write_header(false),
+    write_pos(0), read_pos(0),
+    discard(false),
+#ifdef HAVE_LIBAIO
+    aio_lock("FileJournal::aio_lock"),
+    aio_ctx(0),
+    aio_num(0), aio_bytes(0),
+    aio_write_queue_ops(0),
+    aio_write_queue_bytes(0),
+#endif
+    last_committed_seq(0),
+    journaled_since_start(0),
+    full_state(FULL_NOTFULL),
+    fd(-1),
+    writing_seq(0),
+    throttle(cct->_conf->filestore_caller_concurrency),
+    write_lock("FileJournal::write_lock", false, true, false),
+    write_stop(true),
+    aio_stop(true),
+    write_thread(this),
+    write_finish_thread(this),
+    trace_endpoint("0.0.0.0", 0, "FileJournal") {
+
+      // NOTE(review): both messages below are prefixed "_open_any" although
+      // they are emitted from the constructor.
+      if (aio && !directio) {
+	lderr(cct) << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl;
+        aio = false;
+      }
+#ifndef HAVE_LIBAIO
+      // without libaio, aio stays requested only when CEPH_DEV is set
+      if (aio && ::getenv("CEPH_DEV") == NULL) {
+	lderr(cct) << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl;
+        aio = false;
+      }
+#endif
+
+      cct->_conf.add_observer(this);
+  }
+  /// Asserts the journal fd is already closed (fd == -1), frees zero_buf and
+  /// unregisters the config observer added by the constructor.
+  ~FileJournal() override {
+    ceph_assert(fd == -1);
+    delete[] zero_buf;
+    cct->_conf.remove_observer(this);
+  }
+
+  int check() override;
+  int create() override;
+  int open(uint64_t fs_op_seq) override;
+  void close() override;
+  int peek_fsid(uuid_d& fsid);
+
+  int dump(ostream& out) override;
+  int simple_dump(ostream& out);
+  int _fdump(Formatter &f, bool simple);
+
+  void flush() override;
+
+  void get_devices(set<string> *ls) override;
+  void collect_metadata(map<string,string> *pm) override;
+
+  void reserve_throttle_and_backoff(uint64_t count) override;
+
+  /// @return true when read_pos is 0
+  bool is_writeable() override {
+    return read_pos == 0;
+  }
+  int make_writeable() override;
+
+  // writes
+  void commit_start(uint64_t seq) override;
+  void committed_thru(uint64_t seq) override;
+  /// @return true when full_state is FULL_FULL or FULL_WAIT and write_stop
+  /// is not set
+  bool should_commit_now() override {
+    return full_state != FULL_NOTFULL && !write_stop;
+  }
+
+  void write_header_sync();
+
+  /// Set wait_on_full (not declared in this class; presumably inherited from
+  /// Journal -- confirm).
+  void set_wait_on_full(bool b) { wait_on_full = b; }
+
+  off64_t get_journal_size_estimate();
+
+  // reads
+
+  /// Result code for read_entry
+  enum read_entry_result {
+    SUCCESS,
+    FAILURE,
+    MAYBE_CORRUPT
+  };
+
+  /**
+   * read_entry
+   *
+   * Reads next entry starting at pos.  If the entry appears
+   * clean, *bl will contain the payload, *seq will contain
+   * the sequence number, and *out_pos will reflect the next
+   * read position.  If the entry is invalid *ss will contain
+   * debug text, while *seq, *out_pos, and *bl will be unchanged.
+   *
+   * If the entry suggests a corrupt log, *ss will contain debug
+   * text, *out_pos will contain the next index to check.  If
+   * we find an entry in this way that returns SUCCESS, the journal
+   * is most likely corrupt.
+   */
+  read_entry_result do_read_entry(
+    off64_t pos,          ///< [in] position to read
+    off64_t *next_pos,    ///< [out] next position to read
+    bufferlist* bl,       ///< [out] payload for successful read
+    uint64_t *seq,        ///< [out] seq of successful read
+    ostream *ss,          ///< [out] error output
+    entry_header_t *h = 0 ///< [out] header
+    ) const; ///< @return result code
+
+  bool read_entry(
+    bufferlist &bl,
+    uint64_t &last_seq,
+    bool *corrupt
+    );
+
+  /// Journal interface overload: forwards to the three-argument read_entry()
+  /// with a null `corrupt` pointer.
+  bool read_entry(
+    bufferlist &bl,
+    uint64_t &last_seq) override {
+    return read_entry(bl, last_seq, 0);
+  }
+
+  // Debug/Testing
+  void get_header(
+    uint64_t wanted_seq,
+    off64_t *_pos,
+    entry_header_t *h);
+  void corrupt(
+    int wfd,
+    off64_t corrupt_at);
+  void corrupt_payload(
+    int wfd,
+    uint64_t seq);
+  void corrupt_footer_magic(
+    int wfd,
+    uint64_t seq);
+  void corrupt_header_magic(
+    int wfd,
+    uint64_t seq);
+};
+
+WRITE_CLASS_ENCODER(FileJournal::header_t)
+
+#endif
diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc
new file mode 100644
index 00000000..d387947e
--- /dev/null
+++ b/src/os/filestore/FileStore.cc
@@ -0,0 +1,6425 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "include/compat.h"
+#include "include/int_types.h"
+#include "boost/tuple/tuple.hpp"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#include <linux/falloc.h>
+#endif
+
+#include <iostream>
+#include <map>
+
+#include "include/linux_fiemap.h"
+
+#include "common/xattr.h"
+#include "chain_xattr.h"
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+
+#include <fstream>
+#include <sstream>
+
+#include "FileStore.h"
+#include "GenericFileStoreBackend.h"
+#include "BtrfsFileStoreBackend.h"
+#include "XfsFileStoreBackend.h"
+#include "ZFSFileStoreBackend.h"
+#include "common/BackTrace.h"
+#include "include/types.h"
+#include "FileJournal.h"
+
+#include "osd/osd_types.h"
+#include "include/color.h"
+#include "include/buffer.h"
+
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/run_cmd.h"
+#include "common/safe_io.h"
+#include "common/perf_counters.h"
+#include "common/sync_filesystem.h"
+#include "common/fd.h"
+#include "HashIndex.h"
+#include "DBObjectMap.h"
+#include "kv/KeyValueDB.h"
+
+#include "common/ceph_crypto.h"
+using ceph::crypto::SHA1;
+
+#include "include/ceph_assert.h"
+
+#include "common/config.h"
+#include "common/blkdev.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/objectstore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore(" << basedir << ") "
+
+#define COMMIT_SNAP_ITEM "snap_%llu"
+#define CLUSTER_SNAP_ITEM "clustersnap_%s"
+
+#define REPLAY_GUARD_XATTR "user.cephos.seq"
+#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
+
+// XATTR_SPILL_OUT_NAME is an xattr that records whether an object's xattrs
+// spill over into DBObjectMap.  If it exists in the file's xattrs with the
+// value XATTR_NO_SPILL_OUT ("0"), no xattrs are stored in DBObjectMap.
+#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out"
+#define XATTR_NO_SPILL_OUT "0"
+#define XATTR_SPILL_OUT "1"
+#define __FUNC__ __func__ << "(" << __LINE__ << ")"
+
+//Initial features in new superblock.
+static CompatSet get_fs_initial_compat_set() {
+  // a new superblock starts with all three feature sets empty
+  CompatSet::FeatureSet compat;
+  CompatSet::FeatureSet ro_compat;
+  CompatSet::FeatureSet incompat;
+  return CompatSet(compat, ro_compat, incompat);
+}
+
+//Features are added here that this FileStore supports.
+static CompatSet get_fs_supported_compat_set() {
+  CompatSet supported = get_fs_initial_compat_set();
+  // Features added below can be set in code, but are not in the initial
+  // superblock.
+  supported.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+  return supported;
+}
+
+// Returns -ENAMETOOLONG when the maximum escaped name length of `obj`
+// exceeds m_filestore_max_xattr_value_size, 0 otherwise.
+int FileStore::validate_hobject_key(const hobject_t &obj) const
+{
+  const unsigned escaped_len = LFNIndex::get_max_escaped_name_len(obj);
+  if (escaped_len > m_filestore_max_xattr_value_size) {
+    return -ENAMETOOLONG;
+  }
+  return 0;
+}
+
+// Read the fsid stored in the journal at `path` into *fsid.
+// Returns whatever FileJournal::peek_fsid() returns.
+int FileStore::get_block_device_fsid(CephContext* cct, const string& path,
+				     uuid_d *fsid)
+{
+  // Neither direct I/O nor aio is requested: that avoids the error
+  // messages emitted when they cannot be used, and performance does not
+  // matter for a one-off peek.
+  const bool use_dio = false;
+  const bool use_aio = false;
+  FileJournal journal(cct, *fsid, nullptr, nullptr, path.c_str(),
+		      use_dio, use_aio);
+  return journal.peek_fsid(*fsid);
+}
+
+// Feed the latest journal and apply latency averages from `logger` into the
+// commit and apply latency trackers respectively.
+void FileStore::FSPerfTracker::update_from_perfcounters(
+  PerfCounters &logger)
+{
+  const auto journal_tavg = logger.get_tavg_ns(l_filestore_journal_latency);
+  os_commit_latency_ns.consume_next(journal_tavg);
+
+  const auto apply_tavg = logger.get_tavg_ns(l_filestore_apply_latency);
+  os_apply_latency_ns.consume_next(apply_tavg);
+}
+
+
+// Prints an OpSequencer as "osr(<cid>)".
+ostream& operator<<(ostream& out, const FileStore::OpSequencer& s)
+{
+  out << "osr(" << s.cid << ")";
+  return out;
+}
+
+// Format "<basedir>/current/<cid>" into s (at most len bytes) and return
+// snprintf's result.
+int FileStore::get_cdir(const coll_t& cid, char *s, int len)
+{
+  const string cid_name = cid.to_str();
+  return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_name.c_str());
+}
+
+// Record an I/O error event for this store's device and base directory,
+// then abort the process.
+void FileStore::handle_eio()
+{
+  // don't try to map this back to an offset; too hard since there is
+  // a file system in between.  we also don't really know whether this
+  // was a read or a write, since we have so many layers beneath us.
+  // don't even try.
+  note_io_error_event(devname.c_str(), basedir.c_str(), -EIO, 0, 0, 0);
+  ceph_abort_msg("unexpected eio error");
+}
+
+// Fetch the index for collection `cid` into *index.  Returns the index
+// manager's result; -EIO is escalated through handle_eio() when
+// m_filestore_fail_eio is set.
+int FileStore::get_index(const coll_t& cid, Index *index)
+{
+  const int r = index_manager.get_index(cid, basedir, index);
+  const bool fatal_eio = (r == -EIO) && m_filestore_fail_eio;
+  if (fatal_eio) {
+    handle_eio();
+  }
+  return r;
+}
+
+// Initialise the index for collection `cid` at its directory under
+// <basedir>/current.  Returns the index manager's result; -EIO is escalated
+// through handle_eio() when m_filestore_fail_eio is set.
+int FileStore::init_index(const coll_t& cid)
+{
+  char cdir[PATH_MAX];
+  get_cdir(cid, cdir, sizeof(cdir));
+  const int r = index_manager.init_index(cid, cdir, target_version);
+  const bool fatal_eio = (r == -EIO) && m_filestore_fail_eio;
+  if (fatal_eio) {
+    handle_eio();
+  }
+  return r;
+}
+
+// Look `oid` up in `index`.  `path` may be null when the caller does not
+// need the resolved path.  Returns 0 when the object exists, -ENOENT when
+// it does not, or the lookup's negative error.
+int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path)
+{
+  IndexedPath scratch;
+  IndexedPath *out = path ? path : &scratch;
+  ceph_assert(index.index);
+
+  int exist;
+  const int r = (index.index)->lookup(oid, out, &exist);
+  if (r < 0) {
+    if (r == -EIO && m_filestore_fail_eio) handle_eio();
+    return r;
+  }
+  return exist ? 0 : -ENOENT;
+}
+
+// Truncate the file backing `oid` in `cid` to `length` bytes.  Returns 0 on
+// success or a negative errno; when sloppy CRC tracking is enabled the
+// stored CRC data is updated to match.
+int FileStore::lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length)
+{
+  FDRef fd;
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r < 0) {
+    return r;
+  }
+
+  r = ::ftruncate(**fd, length);
+  if (r < 0) {
+    r = -errno;
+  }
+
+  const bool truncated = (r >= 0);
+  if (truncated && m_filestore_sloppy_crc) {
+    const int rc = backend->_crc_update_truncate(**fd, length);
+    ceph_assert(rc >= 0);
+  }
+
+  lfn_close(fd);
+  if (r == -EIO && m_filestore_fail_eio) {
+    handle_eio();
+  }
+  return r;
+}
+
+// stat(2) the file backing `oid` in `cid` into *buf while holding the
+// collection index's access lock for reading.  Returns 0 or a negative
+// error code.
+int FileStore::lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf)
+{
+  IndexedPath path;
+  Index index;
+  const int index_r = get_index(cid, &index);
+  if (index_r < 0) {
+    return index_r;
+  }
+  ceph_assert(index.index);
+
+  // the read lock is held across both the lookup and the stat
+  RWLock::RLocker rlock((index.index)->access_lock);
+
+  const int find_r = lfn_find(oid, index, &path);
+  if (find_r < 0) {
+    return find_r;
+  }
+
+  int r = ::stat(path->path(), buf);
+  if (r < 0) {
+    r = -errno;
+  }
+  return r;
+}
+
+/**
+ * Open the file backing @p oid in collection @p cid and hand back a
+ * reference-counted fd in @p outfd.
+ *
+ * @param create  add O_CREAT; a file that did not previously exist in the
+ *                index is registered there and gets the spill-out xattr set
+ *                to XATTR_NO_SPILL_OUT
+ * @param outfd   [out] must be non-null
+ * @param index   optional.  When it already refers to an index, that index
+ *                is used and NO lock is taken here (presumably the caller
+ *                holds it -- confirm); otherwise the index is fetched and
+ *                its access lock is held for writing during the call.
+ * @return 0 on success, negative errno on failure.  -EIO is escalated
+ *         through handle_eio() when m_filestore_fail_eio is set.
+ */
+int FileStore::lfn_open(const coll_t& cid,
+			const ghobject_t& oid,
+			bool create,
+			FDRef *outfd,
+                        Index *index)
+{
+  ceph_assert(outfd);
+  int r = 0;
+  bool need_lock = true;
+  int flags = O_RDWR;
+
+  if (create)
+    flags |= O_CREAT;
+  if (cct->_conf->filestore_odsync_write) {
+    flags |= O_DSYNC;
+  }
+
+  Index index2;
+  if (!index) {
+    index = &index2;
+  }
+  if (!((*index).index)) {
+    r = get_index(cid, index);
+    if (r < 0) {
+      dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+      return r;
+    }
+  } else {
+    // caller supplied a live index; locking is left to the caller
+    need_lock = false;
+  }
+
+  int fd, exist;
+  ceph_assert((*index).index);
+  if (need_lock) {
+    ((*index).index)->access_lock.get_write();
+  }
+  // the fd cache is bypassed entirely while replaying
+  if (!replaying) {
+    *outfd = fdcache.lookup(oid);
+    if (*outfd) {
+      if (need_lock) {
+        ((*index).index)->access_lock.put_write();
+      }
+      return 0;
+    }
+  }
+
+
+  IndexedPath path2;
+  IndexedPath *path = &path2;
+
+  r = (*index)->lookup(oid, path, &exist);
+  if (r < 0) {
+    derr << "could not find " << oid << " in index: "
+      << cpp_strerror(-r) << dendl;
+    goto fail;
+  }
+
+  r = ::open((*path)->path(), flags|O_CLOEXEC, 0644);
+  if (r < 0) {
+    r = -errno;
+    dout(10) << "error opening file " << (*path)->path() << " with flags="
+      << flags << ": " << cpp_strerror(-r) << dendl;
+    goto fail;
+  }
+  fd = r;
+  if (create && (!exist)) {
+    // brand-new file: record it in the index, then mark it as having no
+    // xattrs spilled out
+    r = (*index)->created(oid, (*path)->path());
+    if (r < 0) {
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+      derr << "error creating " << oid << " (" << (*path)->path()
+          << ") in index: " << cpp_strerror(-r) << dendl;
+      goto fail;
+    }
+    r = chain_fsetxattr<true, true>(
+      fd, XATTR_SPILL_OUT_NAME,
+      XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT));
+    if (r < 0) {
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+      derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path()
+                     << "):" << cpp_strerror(-r) << dendl;
+      goto fail;
+    }
+  }
+
+  if (!replaying) {
+    bool existed;
+    *outfd = fdcache.add(oid, fd, &existed);
+    if (existed) {
+      // the cache already had an fd for oid; ours is redundant.  Use the
+      // VOID_ variant like the other close sites in this function, since
+      // the result is intentionally ignored.
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+    }
+  } else {
+    *outfd = std::make_shared<FDCache::FD>(fd);
+  }
+
+  if (need_lock) {
+    ((*index).index)->access_lock.put_write();
+  }
+
+  return 0;
+
+ fail:
+
+  if (need_lock) {
+    ((*index).index)->access_lock.put_write();
+  }
+
+  if (r == -EIO && m_filestore_fail_eio) handle_eio();
+  return r;
+}
+
+// Intentionally empty: nothing is done with `fd` here beyond dropping this
+// by-value FDRef copy when the function returns.
+void FileStore::lfn_close(FDRef fd)
+{
+}
+
+// Hard-link object `o` of collection `c` as `newoid` in collection `newcid`
+// and register the new name in the destination index.
+// Returns 0 on success, -ENOENT if `o` is not in its index, -EEXIST if
+// `newoid` already is, or another negative errno.
+int FileStore::lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid)
+{
+  Index index_new, index_old;
+  IndexedPath path_new, path_old;
+  int exist;
+  int r;
+  bool index_same = false;
+  // The order of the two get_index() calls depends on how c compares to
+  // newcid; when they are equal a single index is shared.
+  if (c < newcid) {
+    r = get_index(newcid, &index_new);
+    if (r < 0)
+      return r;
+    r = get_index(c, &index_old);
+    if (r < 0)
+      return r;
+  } else if (c == newcid) {
+    r = get_index(c, &index_old);
+    if (r < 0)
+      return r;
+    index_new = index_old;
+    index_same = true;
+  } else {
+    r = get_index(c, &index_old);
+    if (r < 0)
+      return r;
+    r = get_index(newcid, &index_new);
+    if (r < 0)
+      return r;
+  }
+
+  ceph_assert(index_old.index);
+  ceph_assert(index_new.index);
+
+  if (!index_same) {
+
+    // NOTE(review): the access locks are always taken old (read) then new
+    // (write), regardless of the c/newcid ordering used above.
+    RWLock::RLocker l1((index_old.index)->access_lock);
+
+    r = index_old->lookup(o, &path_old, &exist);
+    if (r < 0) {
+      if (r == -EIO && m_filestore_fail_eio) handle_eio();
+      return r;
+    }
+    if (!exist)
+      return -ENOENT;
+
+    RWLock::WLocker l2((index_new.index)->access_lock);
+
+    r = index_new->lookup(newoid, &path_new, &exist);
+    if (r < 0) {
+      if (r == -EIO && m_filestore_fail_eio) handle_eio();
+      return r;
+    }
+    if (exist)
+      return -EEXIST;
+
+    dout(25) << __FUNC__ << ": path_old: " << path_old << dendl;
+    dout(25) << __FUNC__ << ": path_new: " << path_new << dendl;
+    r = ::link(path_old->path(), path_new->path());
+    if (r < 0)
+      return -errno;
+
+    r = index_new->created(newoid, path_new->path());
+    if (r < 0) {
+      if (r == -EIO && m_filestore_fail_eio) handle_eio();
+      return r;
+    }
+  } else {
+    // same collection: one write lock covers both lookups and the link
+    RWLock::WLocker l1((index_old.index)->access_lock);
+
+    r = index_old->lookup(o, &path_old, &exist);
+    if (r < 0) {
+      if (r == -EIO && m_filestore_fail_eio) handle_eio();
+      return r;
+    }
+    if (!exist)
+      return -ENOENT;
+
+    r = index_new->lookup(newoid, &path_new, &exist);
+    if (r < 0) {
+      if (r == -EIO && m_filestore_fail_eio) handle_eio();
+      return r;
+    }
+    if (exist)
+      return -EEXIST;
+
+    dout(25) << __FUNC__ << ": path_old: " << path_old << dendl;
+    dout(25) << __FUNC__ << ": path_new: " << path_new << dendl;
+    r = ::link(path_old->path(), path_new->path());
+    if (r < 0)
+      return -errno;
+
+    // make sure old fd for unlinked/overwritten file is gone
+    // NOTE(review): the different-collection branch above has no matching
+    // fdcache.clear(newoid) -- confirm that is intentional.
+    fdcache.clear(newoid);
+
+    r = index_new->created(newoid, path_new->path());
+    if (r < 0) {
+      if (r == -EIO && m_filestore_fail_eio) handle_eio();
+      return r;
+    }
+  }
+  return 0;
+}
+
+// Remove object `o` from collection `cid` while holding the index's access
+// lock for writing.  The object's omap is cleared when `force_clear_omap`
+// is set or when the index lookup reports 0 or 1 through `hardlink`.
+// Returns 0 on success or a negative error code.
+int FileStore::lfn_unlink(const coll_t& cid, const ghobject_t& o,
+			  const SequencerPosition &spos,
+			  bool force_clear_omap)
+{
+  Index index;
+  int r = get_index(cid, &index);
+  if (r < 0) {
+    dout(25) << __FUNC__ << ": get_index failed " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  ceph_assert(index.index);
+  RWLock::WLocker l((index.index)->access_lock);
+
+  {
+    IndexedPath path;
+    // third out-value of lookup(); presumably a link count for the object,
+    // with 0 meaning it is not present on disk -- confirm against the index
+    int hardlink;
+    r = index->lookup(o, &path, &hardlink);
+    if (r < 0) {
+      if (r == -EIO && m_filestore_fail_eio) handle_eio();
+      return r;
+    }
+
+    if (!force_clear_omap) {
+      if (hardlink == 0 || hardlink == 1) {
+	  force_clear_omap = true;
+      }
+    }
+    if (force_clear_omap) {
+      dout(20) << __FUNC__ << ": clearing omap on " << o
+	       << " in cid " << cid << dendl;
+      r = object_map->clear(o, &spos);
+      // -ENOENT from clear() is tolerated
+      if (r < 0 && r != -ENOENT) {
+	dout(25) << __FUNC__ << ": omap clear failed " << cpp_strerror(r) << dendl;
+	if (r == -EIO && m_filestore_fail_eio) handle_eio();
+	return r;
+      }
+      if (cct->_conf->filestore_debug_inject_read_err) {
+	debug_obj_on_delete(o);
+      }
+      if (!m_disable_wbthrottle) {
+        wbthrottle.clear_object(o); // should be only non-cache ref
+      }
+      fdcache.clear(o);
+    } else {
+      /* Ensure that replay of this op doesn't result in the object_map
+       * going away.
+       */
+      if (!backend->can_checkpoint())
+	object_map->sync(&o, &spos);
+    }
+    // hardlink == 0: return without calling index->unlink()
+    if (hardlink == 0) {
+      if (!m_disable_wbthrottle) {
+	wbthrottle.clear_object(o); // should be only non-cache ref
+      }
+      return 0;
+    }
+  }
+  r = index->unlink(o);
+  if (r < 0) {
+    dout(25) << __FUNC__ << ": index unlink failed " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+FileStore::FileStore(CephContext* cct, const std::string &base,
+		     const std::string &jdev, osflagbits_t flags,
+		     const char *name, bool do_update) :
+  JournalingObjectStore(cct, base),
+  internal_name(name),
+  basedir(base), journalpath(jdev),
+  generic_flags(flags),
+  blk_size(0),
+  fsid_fd(-1), op_fd(-1),
+  basedir_fd(-1), current_fd(-1),
+  backend(nullptr),
+  index_manager(cct, do_update),
+  lock("FileStore::lock"),
+  force_sync(false),
+  sync_entry_timeo_lock("FileStore::sync_entry_timeo_lock"),
+  timer(cct, sync_entry_timeo_lock),
+  stop(false), sync_thread(this),
+  coll_lock("FileStore::coll_lock"),
+  fdcache(cct),
+  wbthrottle(cct),
+  next_osr_id(0),
+  m_disable_wbthrottle(cct->_conf->filestore_odsync_write ||
+                      !cct->_conf->filestore_wbthrottle_enable),
+  throttle_ops(cct, "filestore_ops", cct->_conf->filestore_caller_concurrency),
+  throttle_bytes(cct, "filestore_bytes", cct->_conf->filestore_caller_concurrency),
+  m_ondisk_finisher_num(cct->_conf->filestore_ondisk_finisher_threads),
+  m_apply_finisher_num(cct->_conf->filestore_apply_finisher_threads),
+  op_tp(cct, "FileStore::op_tp", "tp_fstore_op", cct->_conf->filestore_op_threads, "filestore_op_threads"),
+  op_wq(this, cct->_conf->filestore_op_thread_timeout,
+	cct->_conf->filestore_op_thread_suicide_timeout, &op_tp),
+  logger(nullptr),
+  trace_endpoint("0.0.0.0", 0, "FileStore"),
+  read_error_lock("FileStore::read_error_lock"),
+  m_filestore_commit_timeout(cct->_conf->filestore_commit_timeout),
+  m_filestore_journal_parallel(cct->_conf->filestore_journal_parallel ),
+  m_filestore_journal_trailing(cct->_conf->filestore_journal_trailing),
+  m_filestore_journal_writeahead(cct->_conf->filestore_journal_writeahead),
+  m_filestore_fiemap_threshold(cct->_conf->filestore_fiemap_threshold),
+  m_filestore_max_sync_interval(cct->_conf->filestore_max_sync_interval),
+  m_filestore_min_sync_interval(cct->_conf->filestore_min_sync_interval),
+  m_filestore_fail_eio(cct->_conf->filestore_fail_eio),
+  m_filestore_fadvise(cct->_conf->filestore_fadvise),
+  do_update(do_update),
+  m_journal_dio(cct->_conf->journal_dio),
+  m_journal_aio(cct->_conf->journal_aio),
+  m_journal_force_aio(cct->_conf->journal_force_aio),
+  m_osd_rollback_to_cluster_snap(cct->_conf->osd_rollback_to_cluster_snap),
+  m_osd_use_stale_snap(cct->_conf->osd_use_stale_snap),
+  m_filestore_do_dump(false),
+  m_filestore_dump_fmt(true),
+  m_filestore_sloppy_crc(cct->_conf->filestore_sloppy_crc),
+  m_filestore_sloppy_crc_block_size(cct->_conf->filestore_sloppy_crc_block_size),
+  m_filestore_max_alloc_hint_size(cct->_conf->filestore_max_alloc_hint_size),
+  m_fs_type(0),
+  m_filestore_max_inline_xattr_size(0),
+  m_filestore_max_inline_xattrs(0),
+  m_filestore_max_xattr_value_size(0)
+{
+  m_filestore_kill_at = cct->_conf->filestore_kill_at;
+  for (int i = 0; i < m_ondisk_finisher_num; ++i) {
+    ostringstream oss;
+    oss << "filestore-ondisk-" << i;
+    Finisher *f = new Finisher(cct, oss.str(), "fn_odsk_fstore");
+    ondisk_finishers.push_back(f);
+  }
+  for (int i = 0; i < m_apply_finisher_num; ++i) {
+    ostringstream oss;
+    oss << "filestore-apply-" << i;
+    Finisher *f = new Finisher(cct, oss.str(), "fn_appl_fstore");
+    apply_finishers.push_back(f);
+  }
+
+  ostringstream oss;
+  oss << basedir << "/current";
+  current_fn = oss.str();
+
+  ostringstream sss;
+  sss << basedir << "/current/commit_op_seq";
+  current_op_seq_fn = sss.str();
+
+  ostringstream omss;
+  if (cct->_conf->filestore_omap_backend_path != "") {
+      omap_dir = cct->_conf->filestore_omap_backend_path;
+  } else {
+      omss << basedir << "/current/omap";
+      omap_dir = omss.str();
+  }
+
+  // initialize logger
+  PerfCountersBuilder plb(cct, internal_name, l_filestore_first, l_filestore_last);
+
+  plb.add_u64(l_filestore_journal_queue_ops, "journal_queue_ops", "Operations in journal queue");
+  plb.add_u64(l_filestore_journal_ops, "journal_ops", "Active journal entries to be applied");
+  plb.add_u64(l_filestore_journal_queue_bytes, "journal_queue_bytes", "Size of journal queue");
+  plb.add_u64(l_filestore_journal_bytes, "journal_bytes", "Active journal operation size to be applied");
+  plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency",
+                   NULL, PerfCountersBuilder::PRIO_USEFUL);
+  plb.add_u64_counter(l_filestore_journal_wr, "journal_wr", "Journal write IOs");
+  plb.add_u64_avg(l_filestore_journal_wr_bytes, "journal_wr_bytes", "Journal data written");
+  plb.add_u64(l_filestore_op_queue_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue");
+  plb.add_u64(l_filestore_op_queue_ops, "op_queue_ops", "Operations in writing to FS queue");
+  plb.add_u64_counter(l_filestore_ops, "ops", "Operations written to store");
+  plb.add_u64(l_filestore_op_queue_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue");
+  plb.add_u64(l_filestore_op_queue_bytes, "op_queue_bytes", "Size of writing to FS queue");
+  plb.add_u64_counter(l_filestore_bytes, "bytes", "Data written to store");
+  plb.add_time_avg(l_filestore_apply_latency, "apply_latency", "Apply latency");
+  plb.add_u64(l_filestore_committing, "committing", "Is currently committing");
+
+  plb.add_u64_counter(l_filestore_commitcycle, "commitcycle", "Commit cycles");
+  plb.add_time_avg(l_filestore_commitcycle_interval, "commitcycle_interval", "Average interval between commits");
+  plb.add_time_avg(l_filestore_commitcycle_latency, "commitcycle_latency", "Average latency of commit");
+  plb.add_u64_counter(l_filestore_journal_full, "journal_full", "Journal writes while full");
+  plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg",
+                   "Store operation queue latency", NULL, PerfCountersBuilder::PRIO_USEFUL);
+  plb.add_time(l_filestore_sync_pause_max_lat, "sync_pause_max_latency", "Max latency of op_wq pause before syncfs");
+
+  logger = plb.create_perf_counters();
+
+  cct->get_perfcounters_collection()->add(logger);
+  cct->_conf.add_observer(this);
+
+  superblock.compat_features = get_fs_initial_compat_set();
+}
+
+// Destructor: deletes the Finisher objects created by the constructor,
+// unregisters this store from the config observers and the perf-counter
+// collection, frees the perf counters and stops a running dump.
+FileStore::~FileStore()
+{
+  for (Finisher*& f : ondisk_finishers) {
+    delete f;
+    f = nullptr;
+  }
+  for (Finisher*& f : apply_finishers) {
+    delete f;
+    f = nullptr;
+  }
+
+  cct->_conf.remove_observer(this);
+  cct->get_perfcounters_collection()->remove(logger);
+
+  // the journal holds a copy of our logger pointer; clear it before the
+  // counters themselves are freed
+  if (journal) {
+    journal->logger = nullptr;
+  }
+  delete logger;
+  logger = nullptr;
+
+  if (!m_filestore_do_dump)
+    return;
+  dump_stop();
+}
+
+// Compose the xattr key for a Ceph attribute: "user.ceph." followed by
+// name.  At most len bytes, terminating NUL included, are stored in buf;
+// a longer result is truncated by snprintf.
+static void get_attrname(const char *name, char *buf, int len)
+{
+  const char *const suffix = name;
+  snprintf(buf, len, "user.ceph.%s", suffix);
+}
+
+// Strip the "user.ceph." namespace from an xattr key in place.
+// When *name starts with that prefix, *name is advanced past it and true
+// is returned; otherwise *name is left untouched and false is returned.
+bool parse_attrname(char **name)
+{
+  static const char prefix[] = "user.ceph.";
+  const size_t prefix_len = sizeof(prefix) - 1;  // 10: NUL not counted
+
+  if (strncmp(*name, prefix, prefix_len) != 0)
+    return false;
+
+  *name += prefix_len;
+  return true;
+}
+
+// Publish filestore metadata into *pm.  The backend name and the fs magic
+// (hex) are always reported; partition/device paths, VDO details and the
+// journal's own metadata are added only when
+// filestore_collect_device_partition_information is enabled.
+void FileStore::collect_metadata(map<string,string> *pm)
+{
+  map<string,string>& meta = *pm;
+
+  meta["filestore_backend"] = backend->get_name();
+  {
+    ostringstream magic;
+    magic << "0x" << std::hex << m_fs_type << std::dec;
+    meta["filestore_f_type"] = magic.str();
+  }
+
+  if (!cct->_conf->filestore_collect_device_partition_information)
+    return;
+
+  char partition_path[PATH_MAX];
+  char dev_node[PATH_MAX];
+  BlkDev blkdev(fsid_fd);
+
+  int rc = blkdev.partition(partition_path, PATH_MAX);
+  if (rc == 0) {
+    meta["backend_filestore_partition_path"] = string(partition_path);
+  } else {
+    meta["backend_filestore_partition_path"] = "unknown";
+  }
+
+  rc = blkdev.wholedisk(dev_node, PATH_MAX);
+  if (rc == 0) {
+    meta["backend_filestore_dev_node"] = string(dev_node);
+    devname = dev_node;
+  } else {
+    meta["backend_filestore_dev_node"] = "unknown";
+  }
+
+  // rc still carries the wholedisk() result at this point
+  if (rc == 0 && vdo_fd >= 0) {
+    meta["vdo"] = "true";
+    meta["vdo_physical_size"] =
+      stringify(4096 * get_vdo_stat(vdo_fd, "physical_blocks"));
+  }
+
+  if (journal)
+    journal->collect_metadata(pm);
+}
+
+// Add the raw devices behind the store, and behind the journal when one is
+// present, to *ls.  Returns 0 on success, or the non-zero result of
+// BlkDev::wholedisk() when the device for the fsid file cannot be resolved.
+int FileStore::get_devices(set<string> *ls)
+{
+  BlkDev blkdev(fsid_fd);
+  string whole_disk;
+  const int rc = blkdev.wholedisk(&whole_disk);
+  if (rc != 0)
+    return rc;
+
+  get_raw_devices(whole_disk, ls);
+  if (journal)
+    journal->get_devices(ls);
+  return 0;
+}
+
+// Fill *buf0 with capacity figures for basedir.
+//
+// buf0 is reset first and alerts (if given) is cleared.  Sizes come from
+// ::statfs() on basedir, are overridden by VDO utilization when
+// get_vdo_utilization() reports it, and 'available' is finally reduced by
+// the journal's size estimate.  Returns 0 on success or -errno from
+// ::statfs(); -ENOENT trips an assert, and -EIO calls handle_eio() when
+// m_filestore_fail_eio is set.
+int FileStore::statfs(struct store_statfs_t *buf0, osd_alert_list_t* alerts)
+{
+  buf0->reset();
+  if (alerts)
+    alerts->clear(); // returns nothing for now
+
+  struct statfs vfs;
+  if (::statfs(basedir.c_str(), &vfs) < 0) {
+    const int err = -errno;
+    if (m_filestore_fail_eio && err == -EIO)
+      handle_eio();
+    ceph_assert(err != -ENOENT);
+    return err;
+  }
+
+  uint64_t bfree = vfs.f_bavail * vfs.f_bsize;
+
+  // assume all of leveldb/rocksdb is omap.
+  {
+    map<string,uint64_t> kv_usage;
+    buf0->omap_allocated += object_map->get_db()->get_estimated_size(kv_usage);
+  }
+
+  uint64_t thin_total, thin_avail;
+  if (get_vdo_utilization(vdo_fd, &thin_total, &thin_avail)) {
+    // thin-provisioned volume: free space is capped by what VDO has left
+    bfree = std::min(bfree, thin_avail);
+    buf0->total = thin_total;
+    buf0->allocated = thin_total - thin_avail;
+  } else {
+    buf0->total = vfs.f_blocks * vfs.f_bsize;
+    buf0->allocated = bfree;
+  }
+  buf0->data_stored = bfree;
+  buf0->available = bfree;
+
+  // FIXME: we don't know how to populate buf0->internal_metadata; XFS doesn't
+  // tell us what its internal overhead is.
+
+  if (!journal)
+    return 0;
+
+  // Adjust for writes pending in the journal
+  const uint64_t estimate = journal->get_journal_size_estimate();
+  buf0->internally_reserved = estimate;
+  if (buf0->available <= estimate)
+    buf0->available = 0;
+  else
+    buf0->available -= estimate;
+
+  return 0;
+}
+
+// Per-pool statistics are not provided by FileStore: neither argument is
+// examined and the call always returns -ENOTSUP.
+int FileStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf)
+{
+  const int not_supported = -ENOTSUP;
+  return not_supported;
+}
+
+// Create the FileJournal for journalpath and hand it our perf counters.
+// Does nothing when no journal path is configured.
+void FileStore::new_journal()
+{
+  if (!journalpath.length())
+    return;
+
+  dout(10) << "open_journal at " << journalpath << dendl;
+  journal = new FileJournal(cct, fsid, &finisher, &sync_cond,
+			    journalpath.c_str(),
+			    m_journal_dio, m_journal_aio,
+			    m_journal_force_aio);
+  if (journal) {
+    journal->logger = logger;
+  }
+}
+
+// Dump the journal at journalpath to out using a temporary FileJournal.
+// Returns -EINVAL when no journal path is configured, otherwise the result
+// of FileJournal::dump().
+int FileStore::dump_journal(ostream& out)
+{
+  if (!journalpath.length())
+    return -EINVAL;
+
+  // throw-away object, independent of the 'journal' member
+  FileJournal *dumper = new FileJournal(cct, fsid, &finisher, &sync_cond,
+					 journalpath.c_str(), m_journal_dio);
+  const int r = dumper->dump(out);
+  delete dumper;
+  return r;
+}
+
+// Factory: pick the backend implementation matching the filesystem magic
+// f_type.  Which specialised backends are considered depends on the
+// platform and build flags; any other magic gets GenericFileStoreBackend.
+// The caller owns the returned object.
+FileStoreBackend *FileStoreBackend::create(unsigned long f_type, FileStore *fs)
+{
+#if defined(__linux__)
+  if (f_type == BTRFS_SUPER_MAGIC)
+    return new BtrfsFileStoreBackend(fs);
+# ifdef HAVE_LIBXFS
+  if (f_type == XFS_SUPER_MAGIC)
+    return new XfsFileStoreBackend(fs);
+# endif
+#endif
+#ifdef HAVE_LIBZFS
+  if (f_type == ZFS_SUPER_MAGIC)
+    return new ZFSFileStoreBackend(fs);
+#endif
+  return new GenericFileStoreBackend(fs);
+}
+
+// Record the filesystem magic, instantiate the matching backend (there must
+// be none yet), switch the write-back throttle to btrfs where applicable,
+// and reload the xattr limits from the configuration.
+void FileStore::create_backend(unsigned long f_type)
+{
+  m_fs_type = f_type;
+
+  ceph_assert(!backend);
+  backend = FileStoreBackend::create(f_type, this);
+
+  dout(0) << "backend " << backend->get_name()
+	  << " (magic 0x" << std::hex << f_type << std::dec << ")"
+	  << dendl;
+
+#if defined(__linux__)
+  // Only btrfs needs an adjustment; for XFS nothing is done because
+  // wbthrottle is constructed with fs(WBThrottle::XFS).
+  const bool switch_to_btrfs =
+    (f_type == BTRFS_SUPER_MAGIC) && !m_disable_wbthrottle;
+  if (switch_to_btrfs) {
+    wbthrottle.set_fs(WBThrottle::BTRFS);
+  }
+#endif
+
+  set_xattr_limits_via_conf();
+}
+
+// Initialise (or re-validate) a FileStore under basedir.
+//
+// Steps, in order: open basedir; open and lock basedir/fsid; write a new
+// fsid or check the existing one against the provided fsid; write the
+// version stamp and superblock; create the backend for the detected
+// filesystem and its current/ directory; seed commit_op_seq (plus snap_1
+// when the backend can checkpoint); test-initialise the omap KeyValueDB and
+// record the fsid in <omap_dir>/osd_uuid; create the journal; write the
+// "type" meta key.
+//
+// Returns 0 on success or a negative errno.  All descriptors opened here are
+// closed and the temporary backend is destroyed before returning.
+int FileStore::mkfs()
+{
+  int ret = 0;
+  char fsid_fn[PATH_MAX];
+  char fsid_str[40];
+  uuid_d old_fsid;
+  uuid_d old_omap_fsid;
+
+  dout(1) << "mkfs in " << basedir << dendl;
+  basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+  if (basedir_fd < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  // open+lock fsid
+  snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str());
+  fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644);
+  if (fsid_fd < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl;
+    goto close_basedir_fd;
+  }
+
+  if (lock_fsid() < 0) {
+    ret = -EBUSY;
+    goto close_fsid_fd;
+  }
+
+  if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) {
+    // no usable fsid on disk yet: persist the provided one or a random one
+    if (fsid.is_zero()) {
+      fsid.generate_random();
+      dout(1) << __FUNC__ << ": generated fsid " << fsid << dendl;
+    } else {
+      dout(1) << __FUNC__ << ": using provided fsid " << fsid << dendl;
+    }
+
+    fsid.print(fsid_str);
+    strcat(fsid_str, "\n");
+    ret = ::ftruncate(fsid_fd, 0);
+    if (ret < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": failed to truncate fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_fsid_fd;
+    }
+    ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str));
+    if (ret < 0) {
+      derr << __FUNC__ << ": failed to write fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_fsid_fd;
+    }
+    if (::fsync(fsid_fd) < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": close failed: can't write fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_fsid_fd;
+    }
+    dout(10) << __FUNC__ << ": fsid is " << fsid << dendl;
+  } else {
+    // an fsid already exists on disk; a provided fsid must agree with it
+    if (!fsid.is_zero() && fsid != old_fsid) {
+      derr << __FUNC__ << ": on-disk fsid " << old_fsid << " != provided " << fsid << dendl;
+      ret = -EINVAL;
+      goto close_fsid_fd;
+    }
+    fsid = old_fsid;
+    dout(1) << __FUNC__ << ": fsid is already set to " << fsid << dendl;
+  }
+
+  // version stamp
+  ret = write_version_stamp();
+  if (ret < 0) {
+    derr << __FUNC__ << ": write_version_stamp() failed: "
+	 << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+  // superblock
+  superblock.omap_backend = cct->_conf->filestore_omap_backend;
+  ret = write_superblock();
+  if (ret < 0) {
+    derr << __FUNC__ << ": write_superblock() failed: "
+	 << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+  struct statfs basefs;
+  ret = ::fstatfs(basedir_fd, &basefs);
+  if (ret < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": cannot fstatfs basedir "
+	 << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+#if defined(__linux__)
+  if (basefs.f_type == BTRFS_SUPER_MAGIC &&
+      !g_ceph_context->check_experimental_feature_enabled("btrfs")) {
+    derr << __FUNC__ << ": deprecated btrfs support is not enabled" << dendl;
+    // ret is 0 here (fstatfs succeeded); without an explicit error code
+    // mkfs would report success.  Use the same code as _detect_fs().
+    ret = -EPERM;
+    goto close_fsid_fd;
+  }
+#endif
+
+  create_backend(basefs.f_type);
+
+  ret = backend->create_current();
+  if (ret < 0) {
+    derr << __FUNC__ << ": failed to create current/ " << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+  // write initial op_seq
+  {
+    uint64_t initial_seq = 0;
+    int fd = read_op_seq(&initial_seq);
+    if (fd < 0) {
+      ret = fd;
+      derr << __FUNC__ << ": failed to create " << current_op_seq_fn << ": "
+	   << cpp_strerror(ret) << dendl;
+      goto close_fsid_fd;
+    }
+    if (initial_seq == 0) {
+      ret = write_op_seq(fd, 1);
+      if (ret < 0) {
+	VOID_TEMP_FAILURE_RETRY(::close(fd));
+	derr << __FUNC__ << ": failed to write to " << current_op_seq_fn << ": "
+	     << cpp_strerror(ret) << dendl;
+	goto close_fsid_fd;
+      }
+
+      if (backend->can_checkpoint()) {
+	// create snap_1 too
+	current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC);
+	ceph_assert(current_fd >= 0);
+	char s[NAME_MAX];
+	snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull);
+	ret = backend->create_checkpoint(s, nullptr);
+	VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+	// do not leave a closed descriptor number behind in the member
+	current_fd = -1;
+	if (ret < 0 && ret != -EEXIST) {
+	  VOID_TEMP_FAILURE_RETRY(::close(fd));
+	  derr << __FUNC__ << ": failed to create snap_1: " << cpp_strerror(ret) << dendl;
+	  goto close_fsid_fd;
+	}
+      }
+    }
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+  }
+  ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir);
+  if (ret < 0) {
+    derr << __FUNC__ << ": failed to create " << cct->_conf->filestore_omap_backend << dendl;
+    goto close_fsid_fd;
+  }
+  // create fsid under omap
+  // open+lock fsid
+  int omap_fsid_fd;
+  char omap_fsid_fn[PATH_MAX];
+  snprintf(omap_fsid_fn, sizeof(omap_fsid_fn), "%s/osd_uuid", omap_dir.c_str());
+  omap_fsid_fd = ::open(omap_fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644);
+  if (omap_fsid_fd < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": failed to open " << omap_fsid_fn << ": " << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  }
+
+  if (read_fsid(omap_fsid_fd, &old_omap_fsid) < 0 || old_omap_fsid.is_zero()) {
+    ceph_assert(!fsid.is_zero());
+    fsid.print(fsid_str);
+    strcat(fsid_str, "\n");
+    ret = ::ftruncate(omap_fsid_fd, 0);
+    if (ret < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": failed to truncate fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_omap_fsid_fd;
+    }
+    ret = safe_write(omap_fsid_fd, fsid_str, strlen(fsid_str));
+    if (ret < 0) {
+      derr << __FUNC__ << ": failed to write fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_omap_fsid_fd;
+    }
+    dout(10) << __FUNC__ << ": write success, fsid:" << fsid_str << ", ret:" << ret << dendl;
+    if (::fsync(omap_fsid_fd) < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": close failed: can't write fsid: "
+	   << cpp_strerror(ret) << dendl;
+      goto close_omap_fsid_fd;
+    }
+    dout(10) << "mkfs omap fsid is " << fsid << dendl;
+  } else {
+    // the omap directory already carries a uuid; it must be ours
+    if (fsid != old_omap_fsid) {
+      derr << __FUNC__ << ": " << omap_fsid_fn
+           << " has existed omap fsid " << old_omap_fsid
+           << " != expected osd fsid " << fsid
+           << dendl;
+      ret = -EINVAL;
+      goto close_omap_fsid_fd;
+    }
+    dout(1) << __FUNC__ << ": omap fsid is already set to " << fsid << dendl;
+  }
+
+  dout(1) << cct->_conf->filestore_omap_backend << " db exists/created" << dendl;
+
+  // journal?
+  ret = mkjournal();
+  if (ret)
+    goto close_omap_fsid_fd;
+
+  ret = write_meta("type", "filestore");
+  if (ret)
+    goto close_omap_fsid_fd;
+
+  dout(1) << "mkfs done in " << basedir << dendl;
+  ret = 0;
+
+  // Cleanup ladder: each label releases what was acquired before the
+  // corresponding failure point and falls through to the next one.
+ close_omap_fsid_fd:
+  VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
+ close_fsid_fd:
+  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+  fsid_fd = -1;
+ close_basedir_fd:
+  VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+  basedir_fd = -1;
+  delete backend;
+  backend = nullptr;
+  return ret;
+}
+
+// Load the fsid from basedir/fsid, then make sure a usable journal exists:
+// a temporary journal object is created, checked, and (re)created when the
+// check fails.  Returns 0 when no journal is configured, a negative errno
+// when the fsid cannot be read, otherwise the check()/create() result.
+int FileStore::mkjournal()
+{
+  // read fsid
+  char fsid_path[PATH_MAX];
+  snprintf(fsid_path, sizeof(fsid_path), "%s/fsid", basedir.c_str());
+
+  const int fd = ::open(fsid_path, O_RDONLY|O_CLOEXEC, 0644);
+  if (fd < 0) {
+    const int err = errno;
+    derr << __FUNC__ << ": open error: " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  int ret = read_fsid(fd, &fsid);
+  if (ret < 0) {
+    derr << __FUNC__ << ": read error: " << cpp_strerror(ret) << dendl;
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return ret;
+  }
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+  new_journal();
+  if (!journal)
+    return 0;
+
+  ret = journal->check();
+  if (ret < 0) {
+    // no valid journal there yet: create one
+    ret = journal->create();
+    if (ret == 0) {
+      dout(0) << __FUNC__ << ": created journal on " << journalpath << dendl;
+    } else {
+      derr << __FUNC__ << ": error creating journal on " << journalpath
+	   << ": " << cpp_strerror(ret) << dendl;
+    }
+  }
+
+  // the journal object was only needed for the check/create above
+  delete journal;
+  journal = nullptr;
+  return ret;
+}
+
+// Read an fsid from fd into *uuid.
+//
+// Two on-disk forms are accepted: a legacy 8-byte binary value, which is
+// mirrored into both halves of the 16-byte uuid, and a textual uuid of
+// which at most the first 36 characters are parsed.
+// Returns 0 on success, the negative safe_read() result on I/O failure, or
+// -EINVAL when the text does not parse.
+int FileStore::read_fsid(int fd, uuid_d *uuid)
+{
+  char fsid_str[40];
+  memset(fsid_str, 0, sizeof(fsid_str));
+  int ret = safe_read(fd, fsid_str, sizeof(fsid_str));
+  if (ret < 0)
+    return ret;
+  if (ret == 8) {
+    // old 64-bit fsid... mirror it.
+    // Copy bytewise: neither fsid_str nor the uuid storage is known to be
+    // aligned for uint64_t, and accessing char storage through a uint64_t
+    // lvalue breaks the aliasing rules.  The cast form of the original is
+    // kept because the declared type of bytes() is not visible here.
+    char *raw = (char*)&uuid->bytes()[0];
+    memcpy(raw, fsid_str, sizeof(uint64_t));
+    memcpy(raw + sizeof(uint64_t), fsid_str, sizeof(uint64_t));
+    return 0;
+  }
+
+  // terminate after the 36-character uuid text (drops e.g. a trailing
+  // newline), or at the end of a shorter read
+  if (ret > 36)
+    fsid_str[36] = 0;
+  else
+    fsid_str[ret] = 0;
+  if (!uuid->parse(fsid_str))
+    return -EINVAL;
+  return 0;
+}
+
+// Take a non-blocking exclusive fcntl lock covering the whole fsid file
+// (fsid_fd).  Returns 0 on success or -errno when the lock cannot be
+// obtained.
+int FileStore::lock_fsid()
+{
+  struct flock whole_file;
+  memset(&whole_file, 0, sizeof(whole_file));
+  whole_file.l_type = F_WRLCK;
+  whole_file.l_whence = SEEK_SET;
+  whole_file.l_start = 0;
+  whole_file.l_len = 0;   // 0 == up to end of file
+
+  if (::fcntl(fsid_fd, F_SETLK, &whole_file) >= 0)
+    return 0;
+
+  const int err = errno;
+  dout(0) << __FUNC__ << ": failed to lock " << basedir << "/fsid, is another ceph-osd still running? "
+	  << cpp_strerror(err) << dendl;
+  return -err;
+}
+
+// Report whether the store under basedir is locked by someone else.
+// Opens basedir/fsid and tries to lock it; a missing/unopenable fsid file
+// counts as "not in use".  The descriptor is closed again before returning.
+bool FileStore::test_mount_in_use()
+{
+  dout(5) << __FUNC__ << ": basedir " << basedir << " journal " << journalpath << dendl;
+
+  char fsid_path[PATH_MAX];
+  snprintf(fsid_path, sizeof(fsid_path), "%s/fsid", basedir.c_str());
+
+  // verify fs isn't in use
+  fsid_fd = ::open(fsid_path, O_RDWR|O_CLOEXEC, 0644);
+  if (fsid_fd < 0)
+    return false;   // no fsid, ok.
+
+  const int lock_rc = lock_fsid();
+  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+  fsid_fd = -1;
+  return lock_rc < 0;
+}
+
+// Ask the backend whether the data device is rotational.  Without a
+// backend, one is created temporarily from the filesystem type of basedir
+// and destroyed again; if basedir cannot be opened or stat'ed the answer
+// defaults to true.
+bool FileStore::is_rotational()
+{
+  bool rotational;
+  if (!backend) {
+    const int fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+    if (fd < 0)
+      return true;
+
+    struct statfs st;
+    const int r = ::fstatfs(fd, &st);
+    ::close(fd);
+    if (r < 0)
+      return true;
+
+    // short-lived backend, only used for this query
+    create_backend(st.f_type);
+    rotational = backend->is_rotational();
+    delete backend;
+    backend = nullptr;
+  } else {
+    rotational = backend->is_rotational();
+  }
+  dout(10) << __func__ << " " << (int)rotational << dendl;
+  return rotational;
+}
+
+// Ask the backend whether the journal device is rotational.  Without a
+// backend, one is created temporarily from the filesystem type found at
+// journalpath and destroyed again; if journalpath cannot be opened or
+// stat'ed the answer defaults to true.
+bool FileStore::is_journal_rotational()
+{
+  bool journal_rotational;
+  if (!backend) {
+    const int fd = ::open(journalpath.c_str(), O_RDONLY|O_CLOEXEC);
+    if (fd < 0)
+      return true;
+
+    struct statfs st;
+    const int r = ::fstatfs(fd, &st);
+    ::close(fd);
+    if (r < 0)
+      return true;
+
+    // short-lived backend, only used for this query
+    create_backend(st.f_type);
+    journal_rotational = backend->is_journal_rotational();
+    delete backend;
+    backend = nullptr;
+  } else {
+    journal_rotational = backend->is_journal_rotational();
+  }
+  dout(10) << __func__ << " " << (int)journal_rotational << dendl;
+  return journal_rotational;
+}
+
+// Probe the filesystem behind basedir_fd and set up the matching backend.
+//
+// Records the block size, refuses btrfs unless the experimental feature is
+// enabled (-EPERM), creates the backend and runs its feature detection,
+// looks for a VDO volume under the fsid file's device, and finally checks
+// on a scratch file (basedir/xattr_test) that extended attributes round-trip.
+// Returns 0 on success, -ENOTSUP when xattrs do not work, or another
+// negative errno from the failing step.
+int FileStore::_detect_fs()
+{
+  struct statfs st;
+  int r = ::fstatfs(basedir_fd, &st);
+  if (r < 0)
+    return -errno;
+
+  blk_size = st.f_bsize;
+
+#if defined(__linux__)
+  if (st.f_type == BTRFS_SUPER_MAGIC &&
+      !g_ceph_context->check_experimental_feature_enabled("btrfs")) {
+    derr <<__FUNC__ << ": deprecated btrfs support is not enabled" << dendl;
+    return -EPERM;
+  }
+#endif
+
+  create_backend(st.f_type);
+
+  r = backend->detect_features();
+  if (r < 0) {
+    derr << __FUNC__ << ": detect_features error: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // vdo
+  {
+    char dev_node[PATH_MAX];
+    if (int rc = BlkDev{fsid_fd}.wholedisk(dev_node, PATH_MAX); rc == 0) {
+      vdo_fd = get_vdo_stats_handle(dev_node, &vdo_name);
+      if (vdo_fd >= 0) {
+	dout(0) << __func__ << " VDO volume " << vdo_name << " for " << dev_node
+		<< dendl;
+      }
+    }
+  }
+
+  // test xattrs
+  char fn[PATH_MAX];
+  int x = rand();
+  // y only has to start out different from x; ~x cannot overflow, whereas
+  // x+1 would when rand() returns INT_MAX
+  int y = ~x;
+  snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str());
+  int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0700);
+  if (tmpfd < 0) {
+    int ret = -errno;
+    derr << __FUNC__ << ": unable to create " << fn << ": " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x));
+  if (ret >= 0)
+    ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y));
+  if ((ret < 0) || (x != y)) {
+    derr << "Extended attributes don't appear to work. ";
+    // only a negative ret is an errno; a non-negative ret with x != y means
+    // the calls succeeded but returned the wrong value
+    if (ret < 0)
+      *_dout << "Got error " + cpp_strerror(ret) + ". ";
+    *_dout << "If you are using ext3 or ext4, be sure to mount the underlying "
+	   << "file system with the 'user_xattr' option." << dendl;
+    ::unlink(fn);
+    VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+    return -ENOTSUP;
+  }
+
+  // Probe for limited xattr space by storing five 1000-byte values; the
+  // outcome is only logged.
+  char buf[1000];
+  memset(buf, 0, sizeof(buf)); // shut up valgrind
+  chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf));
+  chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf));
+  chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf));
+  chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
+  ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
+  if (ret == -ENOSPC) {
+    dout(0) << "limited size xattrs" << dendl;
+  }
+  chain_fremovexattr(tmpfd, "user.test");
+  chain_fremovexattr(tmpfd, "user.test2");
+  chain_fremovexattr(tmpfd, "user.test3");
+  chain_fremovexattr(tmpfd, "user.test4");
+  chain_fremovexattr(tmpfd, "user.test5");
+
+  ::unlink(fn);
+  VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+
+  return 0;
+}
+
+// Validate the journal configuration before mounting.
+// Fails with -EINVAL when more than one journal mode is enabled; the other
+// findings (no checkpointing without a writeahead journal, no journal at
+// all) only produce warnings.  Returns 0 otherwise.
+int FileStore::_sanity_check_fs()
+{
+  // sanity check(s)
+
+  const int journal_modes_enabled =
+    static_cast<int>(m_filestore_journal_writeahead) +
+    static_cast<int>(m_filestore_journal_parallel) +
+    static_cast<int>(m_filestore_journal_trailing);
+  if (journal_modes_enabled > 1) {
+    dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl;
+    cerr << TEXT_RED
+	 << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n"
+	 << "             is enabled in ceph.conf.  You must choose a single journal mode."
+	 << TEXT_NORMAL << std::endl;
+    return -EINVAL;
+  }
+
+  // without checkpoints, only a writeahead journal keeps the store
+  // consistent across a crash
+  if (!backend->can_checkpoint() &&
+      (!journal || !m_filestore_journal_writeahead)) {
+    dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl;
+    cerr << TEXT_RED
+	 << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n"
+	 << "             For non-btrfs volumes, a writeahead journal is required to\n"
+	 << "             maintain on-disk consistency in the event of a crash.  Your conf\n"
+	 << "             should include something like:\n"
+	 << "        osd journal = /path/to/journal_device_or_file\n"
+	 << "        filestore journal writeahead = true\n"
+	 << TEXT_NORMAL;
+  }
+
+  if (journal)
+    return 0;
+
+  dout(0) << "mount WARNING: no journal" << dendl;
+  cerr << TEXT_YELLOW
+       << " ** WARNING: No osd journal is configured: write latency may be high.\n"
+       << "             If you will not be using an osd journal, write latency may be\n"
+       << "             relatively high.  It can be reduced somewhat by lowering\n"
+       << "             filestore_max_sync_interval, but lower values mean lower write\n"
+       << "             throughput, especially with spinning disks.\n"
+       << TEXT_NORMAL;
+  return 0;
+}
+
+// Encode the in-memory superblock and store it as basedir/superblock
+// (mode 0600).  Returns the safe_write_file() result.
+int FileStore::write_superblock()
+{
+  bufferlist encoded;
+  encode(superblock, encoded);
+
+  const int r = safe_write_file(basedir.c_str(), "superblock",
+				encoded.c_str(), encoded.length(), 0600);
+  return r;
+}
+
+// Load basedir/superblock into the in-memory superblock.
+// A missing file is not an error: the current in-memory superblock is
+// written out instead.  Returns 0 on success, the write_superblock()
+// result in the missing-file case, or the negative read error.
+int FileStore::read_superblock()
+{
+  bufferptr raw(PATH_MAX);
+  const int ret = safe_read_file(basedir.c_str(), "superblock",
+				 raw.c_str(), raw.length());
+  if (ret == -ENOENT) {
+    // If the file doesn't exist write initial CompatSet
+    return write_superblock();
+  }
+  if (ret < 0)
+    return ret;
+
+  bufferlist bl;
+  bl.push_back(std::move(raw));
+  auto it = bl.cbegin();
+  decode(superblock, it);
+  return 0;
+}
+
+// Bring the on-disk version stamp up to target_version; simply forwards to
+// write_version_stamp() and returns its result.
+int FileStore::update_version_stamp()
+{
+  const int r = write_version_stamp();
+  return r;
+}
+
+// Read basedir/store_version into *version and compare it with
+// target_version.  Returns 1 when they match, 0 when they differ, or the
+// negative safe_read_file() error (in which case *version is not set).
+int FileStore::version_stamp_is_valid(uint32_t *version)
+{
+  bufferptr raw(PATH_MAX);
+  const int ret = safe_read_file(basedir.c_str(), "store_version",
+				 raw.c_str(), raw.length());
+  if (ret < 0)
+    return ret;
+
+  bufferlist bl;
+  bl.push_back(std::move(raw));
+  auto it = bl.cbegin();
+  decode(*version, it);
+  dout(10) << __FUNC__ << ": was " << *version << " vs target "
+	   << target_version << dendl;
+  return (*version == target_version) ? 1 : 0;
+}
+
+// Ask the kernel to drop its caches by writing "3" to
+// /proc/sys/vm/drop_caches.  Failures are logged and, when os is non-null,
+// also reported on *os.  Returns 0 on success or -errno from open/write.
+int FileStore::flush_cache(ostream *os)
+{
+  string drop_caches_file = "/proc/sys/vm/drop_caches";
+  char buf[2] = "3";
+  size_t len = strlen(buf);
+
+  const int drop_caches_fd = ::open(drop_caches_file.c_str(), O_WRONLY|O_CLOEXEC);
+  if (drop_caches_fd < 0) {
+    const int err = -errno;
+    derr << __FUNC__ << ": failed to open " << drop_caches_file << ": " << cpp_strerror(err) << dendl;
+    if (os) {
+      *os << "FileStore flush_cache: failed to open " << drop_caches_file << ": " << cpp_strerror(err);
+    }
+    return err;
+  }
+
+  int ret = 0;
+  if (::write(drop_caches_fd, buf, len) < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": failed to write to " << drop_caches_file << ": " << cpp_strerror(ret) << dendl;
+    if (os) {
+      *os << "FileStore flush_cache: failed to write to " << drop_caches_file << ": " << cpp_strerror(ret);
+    }
+  }
+
+  ::close(drop_caches_fd);
+  return ret;
+}
+
+// Encode target_version and store it as basedir/store_version (mode 0600).
+// Returns the safe_write_file() result.
+int FileStore::write_version_stamp()
+{
+  dout(1) << __FUNC__ << ": " << target_version << dendl;
+
+  bufferlist encoded;
+  encode(target_version, encoded);
+
+  const int r = safe_write_file(basedir.c_str(), "store_version",
+				encoded.c_str(), encoded.length(), 0600);
+  return r;
+}
+
+// Upgrade the on-disk format to target_version.
+//
+// Returns 0 when the store is already current or the stamp was updated,
+// -EINVAL when the store_version file is missing or the store is older
+// than version 3, or a negative errno when reading or rewriting the version
+// stamp fails.
+int FileStore::upgrade()
+{
+  dout(1) << __FUNC__ << dendl;
+  uint32_t version;
+  int r = version_stamp_is_valid(&version);
+
+  if (r == -ENOENT) {
+      derr << "The store_version file doesn't exist." << dendl;
+      return -EINVAL;
+  }
+  if (r < 0)
+    return r;
+  if (r == 1)
+    return 0;
+
+  if (version < 3) {
+    derr << "ObjectStore is old at version " << version << ".  Please upgrade to firefly v0.80.x, convert your store, and then upgrade."  << dendl;
+    return -EINVAL;
+  }
+
+  // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to
+  // open up DBObjectMap with the do_upgrade flag, which we already did.
+  // Do not report a successful upgrade if the new stamp could not be
+  // written: the store would still carry the stale version on disk.
+  r = update_version_stamp();
+  if (r < 0) {
+    derr << __FUNC__ << ": failed to update version stamp: "
+	 << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+// Open (creating it if needed) the commit_op_seq file and parse the
+// sequence number stored in it into *seq.
+// On success the OPEN descriptor is returned and the caller must close it.
+// On failure a negative errno is returned and nothing is left open.
+int FileStore::read_op_seq(uint64_t *seq)
+{
+  const int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR|O_CLOEXEC, 0644);
+  if (op_fd < 0) {
+    const int err = -errno;
+    if (m_filestore_fail_eio && err == -EIO)
+      handle_eio();
+    return err;
+  }
+
+  // zero-filled and read one byte short, so the text is always terminated
+  char text[40] = {0};
+  const int nread = safe_read(op_fd, text, sizeof(text) - 1);
+  if (nread < 0) {
+    derr << __FUNC__ << ": error reading " << current_op_seq_fn << ": " << cpp_strerror(nread) << dendl;
+    VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+    ceph_assert(!m_filestore_fail_eio || nread != -EIO);
+    return nread;
+  }
+
+  *seq = atoll(text);
+  return op_fd;
+}
+
+// Write seq as decimal text plus a newline at offset 0 of fd.
+// Returns the pwrite() byte count on success or -errno on failure; an -EIO
+// failure asserts when m_filestore_fail_eio is set.
+int FileStore::write_op_seq(int fd, uint64_t seq)
+{
+  char s[30];
+  // seq is unsigned: PRIu64 is the matching conversion (PRId64 would print
+  // values above INT64_MAX as negative numbers)
+  snprintf(s, sizeof(s), "%" PRIu64 "\n", seq);
+  int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0));
+  if (ret < 0) {
+    ret = -errno;
+    ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+  }
+  return ret;
+}
+
+int FileStore::mount()
+{
+  int ret;
+  char buf[PATH_MAX];
+  uint64_t initial_op_seq;
+  uuid_d omap_fsid;
+  set<string> cluster_snaps;
+  CompatSet supported_compat_set = get_fs_supported_compat_set();
+
+  dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;
+
+  ret = set_throttle_params();
+  if (ret != 0)
+    goto done;
+
+  // make sure global base dir exists
+  if (::access(basedir.c_str(), R_OK | W_OK)) {
+    ret = -errno;
+    derr << __FUNC__ << ": unable to access basedir '" << basedir << "': "
+	 << cpp_strerror(ret) << dendl;
+    goto done;
+  }
+
+  // get fsid
+  snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str());
+  fsid_fd = ::open(buf, O_RDWR|O_CLOEXEC, 0644);
+  if (fsid_fd < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": error opening '" << buf << "': "
+	 << cpp_strerror(ret) << dendl;
+    goto done;
+  }
+
+  ret = read_fsid(fsid_fd, &fsid);
+  if (ret < 0) {
+    derr << __FUNC__ << ": error reading fsid_fd: " << cpp_strerror(ret)
+	 << dendl;
+    goto close_fsid_fd;
+  }
+
+  if (lock_fsid() < 0) {
+    derr << __FUNC__ << ": lock_fsid failed" << dendl;
+    ret = -EBUSY;
+    goto close_fsid_fd;
+  }
+
+  dout(10) << "mount fsid is " << fsid << dendl;
+
+
+  uint32_t version_stamp;
+  ret = version_stamp_is_valid(&version_stamp);
+  if (ret < 0) {
+    derr << __FUNC__ << ": error in version_stamp_is_valid: "
+	 << cpp_strerror(ret) << dendl;
+    goto close_fsid_fd;
+  } else if (ret == 0) {
+    if (do_update || (int)version_stamp < cct->_conf->filestore_update_to) {
+      derr << __FUNC__ << ": stale version stamp detected: "
+	   << version_stamp
+	   << ". Proceeding, do_update "
+	   << "is set, performing disk format upgrade."
+	   << dendl;
+      do_update = true;
+    } else {
+      ret = -EINVAL;
+      derr << __FUNC__ << ": stale version stamp " << version_stamp
+	   << ". Please run the FileStore update script before starting the "
+	   << "OSD, or set filestore_update_to to " << target_version
+	   << " (currently " << cct->_conf->filestore_update_to << ")"
+	   << dendl;
+      goto close_fsid_fd;
+    }
+  }
+
+  ret = read_superblock();
+  if (ret < 0) {
+    goto close_fsid_fd;
+  }
+
+  // Check if this FileStore supports all the necessary features to mount
+  if (supported_compat_set.compare(superblock.compat_features) == -1) {
+    derr << __FUNC__ << ": Incompatible features set "
+	   << superblock.compat_features << dendl;
+    ret = -EINVAL;
+    goto close_fsid_fd;
+  }
+
+  // open some dir handles
+  basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+  if (basedir_fd < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": failed to open " << basedir << ": "
+	 << cpp_strerror(ret) << dendl;
+    basedir_fd = -1;
+    goto close_fsid_fd;
+  }
+
+  // test for btrfs, xattrs, etc.
+  ret = _detect_fs();
+  if (ret < 0) {
+    derr << __FUNC__ << ": error in _detect_fs: "
+	 << cpp_strerror(ret) << dendl;
+    goto close_basedir_fd;
+  }
+
+  {
+    list<string> ls;
+    ret = backend->list_checkpoints(ls);
+    if (ret < 0) {
+      derr << __FUNC__ << ": error in _list_snaps: "<< cpp_strerror(ret) << dendl;
+      goto close_basedir_fd;
+    }
+
+    long long unsigned c, prev = 0;
+    char clustersnap[NAME_MAX];
+    for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
+      if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
+	ceph_assert(c > prev);
+	prev = c;
+	snaps.push_back(c);
+      } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
+	cluster_snaps.insert(*it);
+    }
+  }
+
+  if (m_osd_rollback_to_cluster_snap.length() &&
+      cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) {
+    derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl;
+    ret = -ENOENT;
+    goto close_basedir_fd;
+  }
+
+  char nosnapfn[200];
+  snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str());
+
+  if (backend->can_checkpoint()) {
+    if (snaps.empty()) {
+      dout(0) << __FUNC__ << ": WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
+    } else {
+      char s[NAME_MAX];
+      uint64_t curr_seq = 0;
+
+      if (m_osd_rollback_to_cluster_snap.length()) {
+	derr << TEXT_RED
+	     << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **"
+	     << TEXT_NORMAL
+	     << dendl;
+	ceph_assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
+	snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
+      } else {
+	{
+	  int fd = read_op_seq(&curr_seq);
+	  if (fd >= 0) {
+	    VOID_TEMP_FAILURE_RETRY(::close(fd));
+	  }
+	}
+	if (curr_seq)
+	  dout(10) << " current/ seq was " << curr_seq << dendl;
+	else
+	  dout(10) << " current/ missing entirely (unusual, but okay)" << dendl;
+
+	uint64_t cp = snaps.back();
+	dout(10) << " most recent snap from " << snaps << " is " << cp << dendl;
+
+	// if current/ is marked as non-snapshotted, refuse to roll
+	// back (without clear direction) to avoid throwing out new
+	// data.
+	struct stat st;
+	if (::stat(nosnapfn, &st) == 0) {
+	  if (!m_osd_use_stale_snap) {
+	    derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl;
+	    derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl;
+	    derr << "config option for --osd-use-stale-snap startup argument." << dendl;
+	    ret = -ENOTSUP;
+	    goto close_basedir_fd;
+	  }
+	  derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq
+	       << ", newest snap is " << cp << dendl;
+	  cerr << TEXT_YELLOW
+	       << " ** WARNING: forcing the use of stale snapshot data **"
+	       << TEXT_NORMAL << std::endl;
+	}
+
+        dout(10) << __FUNC__ << ": rolling back to consistent snap " << cp << dendl;
+	snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+      }
+
+      // drop current?
+      ret = backend->rollback_to(s);
+      if (ret) {
+	derr << __FUNC__ << ": error rolling back to " << s << ": "
+	     << cpp_strerror(ret) << dendl;
+	goto close_basedir_fd;
+      }
+    }
+  }
+  initial_op_seq = 0;
+
+  current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC);
+  if (current_fd < 0) {
+    ret = -errno;
+    derr << __FUNC__ << ": error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl;
+    goto close_basedir_fd;
+  }
+
+  ceph_assert(current_fd >= 0);
+
+  op_fd = read_op_seq(&initial_op_seq);
+  if (op_fd < 0) {
+    ret = op_fd;
+    derr << __FUNC__ << ": read_op_seq failed" << dendl;
+    goto close_current_fd;
+  }
+
+  dout(5) << "mount op_seq is " << initial_op_seq << dendl;
+  if (initial_op_seq == 0) {
+    derr << "mount initial op seq is 0; something is wrong" << dendl;
+    ret = -EINVAL;
+    goto close_current_fd;
+  }
+
+  if (!backend->can_checkpoint()) {
+    // mark current/ as non-snapshotted so that we don't rollback away
+    // from it.
+    int r = ::creat(nosnapfn, 0644);
+    if (r < 0) {
+      ret = -errno;
+      derr << __FUNC__ << ": failed to create current/nosnap" << dendl;
+      goto close_current_fd;
+    }
+    VOID_TEMP_FAILURE_RETRY(::close(r));
+  } else {
+    // clear nosnap marker, if present.
+    ::unlink(nosnapfn);
+  }
+
+  // check fsid with omap
+  // get omap fsid
+  char omap_fsid_buf[PATH_MAX];
+  struct ::stat omap_fsid_stat;
+  snprintf(omap_fsid_buf, sizeof(omap_fsid_buf), "%s/osd_uuid", omap_dir.c_str());
+  // if osd_uuid not exists, assume as this omap matchs corresponding osd
+  if (::stat(omap_fsid_buf, &omap_fsid_stat) != 0){
+    dout(10) << __FUNC__ << ": osd_uuid not found under omap, "
+             << "assume as matched."
+             << dendl;
+  } else {
+    int omap_fsid_fd;
+    // if osd_uuid exists, compares osd_uuid with fsid
+    omap_fsid_fd = ::open(omap_fsid_buf, O_RDONLY|O_CLOEXEC, 0644);
+    if (omap_fsid_fd < 0) {
+        ret = -errno;
+        derr << __FUNC__ << ": error opening '" << omap_fsid_buf << "': "
+             << cpp_strerror(ret)
+             << dendl;
+        goto close_current_fd;
+    }
+    ret = read_fsid(omap_fsid_fd, &omap_fsid);
+    VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
+    if (ret < 0) {
+      derr << __FUNC__ << ": error reading omap_fsid_fd"
+           << ", omap_fsid = " << omap_fsid
+           << cpp_strerror(ret)
+           << dendl;
+      goto close_current_fd;
+    }
+    if (fsid != omap_fsid) {
+      derr << __FUNC__ << ": " << omap_fsid_buf
+           << " has existed omap fsid " << omap_fsid
+           << " != expected osd fsid " << fsid
+           << dendl;
+      ret = -EINVAL;
+      goto close_current_fd;
+    }
+  }
+
+  dout(0) << "start omap initiation" << dendl;
+  if (!(generic_flags & SKIP_MOUNT_OMAP)) {
+    KeyValueDB * omap_store = KeyValueDB::create(cct,
+						 superblock.omap_backend,
+						 omap_dir);
+    if (!omap_store)
+    {
+      derr << __FUNC__ << ": Error creating " << superblock.omap_backend << dendl;
+      ret = -1;
+      goto close_current_fd;
+    }
+
+    if (superblock.omap_backend == "rocksdb")
+      ret = omap_store->init(cct->_conf->filestore_rocksdb_options);
+    else
+      ret = omap_store->init();
+
+    if (ret < 0) {
+      derr << __FUNC__ << ": Error initializing omap_store: " << cpp_strerror(ret) << dendl;
+      goto close_current_fd;
+    }
+
+    stringstream err;
+    if (omap_store->create_and_open(err)) {
+      delete omap_store;
+      omap_store = nullptr;
+      derr << __FUNC__ << ": Error initializing " << superblock.omap_backend
+	   << " : " << err.str() << dendl;
+      ret = -1;
+      goto close_current_fd;
+    }
+
+    DBObjectMap *dbomap = new DBObjectMap(cct, omap_store);
+    ret = dbomap->init(do_update);
+    if (ret < 0) {
+      delete dbomap;
+      dbomap = nullptr;
+      derr << __FUNC__ << ": Error initializing DBObjectMap: " << ret << dendl;
+      goto close_current_fd;
+    }
+    stringstream err2;
+
+    if (cct->_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
+      derr << err2.str() << dendl;
+      delete dbomap;
+      dbomap = nullptr;
+      ret = -EINVAL;
+      goto close_current_fd;
+    }
+    object_map.reset(dbomap);
+  }
+
+  // journal
+  new_journal();
+
+  // select journal mode?
+  if (journal) {
+    if (!m_filestore_journal_writeahead &&
+	!m_filestore_journal_parallel &&
+	!m_filestore_journal_trailing) {
+      if (!backend->can_checkpoint()) {
+	m_filestore_journal_writeahead = true;
+	dout(0) << __FUNC__ << ": enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
+      } else {
+	m_filestore_journal_parallel = true;
+	dout(0) << __FUNC__ << ": enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl;
+      }
+    } else {
+      if (m_filestore_journal_writeahead)
+	dout(0) << __FUNC__ << ": WRITEAHEAD journal mode explicitly enabled in conf" << dendl;
+      if (m_filestore_journal_parallel)
+	dout(0) << __FUNC__ << ": PARALLEL journal mode explicitly enabled in conf" << dendl;
+      if (m_filestore_journal_trailing)
+	dout(0) << __FUNC__ << ": TRAILING journal mode explicitly enabled in conf" << dendl;
+    }
+    if (m_filestore_journal_writeahead)
+      journal->set_wait_on_full(true);
+  } else {
+    dout(0) << __FUNC__ << ": no journal" << dendl;
+  }
+
+  ret = _sanity_check_fs();
+  if (ret) {
+    derr << __FUNC__ << ": _sanity_check_fs failed with error "
+	 << ret << dendl;
+    goto close_current_fd;
+  }
+
+  // Cleanup possibly invalid collections
+  {
+    vector<coll_t> collections;
+    ret = list_collections(collections, true);
+    if (ret < 0) {
+      derr << "Error " << ret << " while listing collections" << dendl;
+      goto close_current_fd;
+    }
+    for (vector<coll_t>::iterator i = collections.begin();
+	 i != collections.end();
+	 ++i) {
+      Index index;
+      ret = get_index(*i, &index);
+      if (ret < 0) {
+	derr << "Unable to mount index " << *i
+	     << " with error: " << ret << dendl;
+	goto close_current_fd;
+      }
+      ceph_assert(index.index);
+      RWLock::WLocker l((index.index)->access_lock);
+
+      index->cleanup();
+    }
+  }
+  if (!m_disable_wbthrottle) {
+    wbthrottle.start();
+  } else {
+    dout(0) << __FUNC__ << ": INFO: WbThrottle is disabled" << dendl;
+    if (cct->_conf->filestore_odsync_write) {
+      dout(0) << __FUNC__ << ": INFO: O_DSYNC write is enabled" << dendl;
+    }
+  }
+  sync_thread.create("filestore_sync");
+
+  if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
+    ret = journal_replay(initial_op_seq);
+    if (ret < 0) {
+      derr << __FUNC__ << ": failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl;
+      if (ret == -ENOTTY) {
+        derr << "maybe journal is not pointing to a block device and its size "
+	     << "wasn't configured?" << dendl;
+      }
+
+      goto stop_sync;
+    }
+  }
+
+  {
+    stringstream err2;
+    if (cct->_conf->filestore_debug_omap_check && !object_map->check(err2)) {
+      derr << err2.str() << dendl;
+      ret = -EINVAL;
+      goto stop_sync;
+    }
+  }
+
+  init_temp_collections();
+
+  journal_start();
+
+  op_tp.start();
+  for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+    (*it)->start();
+  }
+  for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+    (*it)->start();
+  }
+
+  timer.init();
+
+  // upgrade?
+  if (cct->_conf->filestore_update_to >= (int)get_target_version()) {
+    int err = upgrade();
+    if (err < 0) {
+      derr << "error converting store" << dendl;
+      umount();
+      return err;
+    }
+  }
+
+  // all okay.
+  return 0;
+
+stop_sync:
+  // stop sync thread
+  lock.Lock();
+  stop = true;
+  sync_cond.Signal();
+  lock.Unlock();
+  sync_thread.join();
+  if (!m_disable_wbthrottle) {
+    wbthrottle.stop();
+  }
+close_current_fd:
+  VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+  current_fd = -1;
+close_basedir_fd:
+  VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+  basedir_fd = -1;
+close_fsid_fd:
+  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+  fsid_fd = -1;
+done:
+  ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+  delete backend;
+  backend = nullptr;
+  object_map.reset();
+  return ret;
+}
+
+// Reconcile temp collections with the regular collections on disk.
+//
+// Every non-temp collection gets an OpSequencer registered in coll_map.
+// Every non-temp, non-meta collection must have its temp sibling (created
+// here if absent); temp collections that belong to no listed collection
+// are removed recursively.  Any failure trips a ceph_assert.
+void FileStore::init_temp_collections()
+{
+  dout(10) << __FUNC__ << dendl;
+
+  vector<coll_t> all_colls;
+  int r = list_collections(all_colls, true);
+  ceph_assert(r >= 0);
+  dout(20) << " ls " << all_colls << dendl;
+
+  SequencerPosition spos;
+
+  // start with every temp collection found; entries are struck off below
+  // as their owning collection turns up
+  set<coll_t> stray_temps;
+  for (const coll_t& c : all_colls) {
+    if (c.is_temp())
+      stray_temps.insert(c);
+  }
+  dout(20) << " temps " << stray_temps << dendl;
+
+  for (const coll_t& c : all_colls) {
+    if (c.is_temp())
+      continue;
+    coll_map[c] = new OpSequencer(cct, ++next_osr_id, c);
+    if (c.is_meta())
+      continue;
+
+    const coll_t temp = c.get_temp();
+    if (stray_temps.erase(temp) > 0)
+      continue;  // temp sibling already exists
+
+    dout(10) << __FUNC__ << ": creating " << temp << dendl;
+    r = _create_collection(temp, 0, spos);
+    ceph_assert(r == 0);
+  }
+
+  // anything still in the set has no owning collection
+  for (const coll_t& stray : stray_temps) {
+    dout(10) << __FUNC__ << ": removing stray " << stray << dendl;
+    r = _collection_remove_recursive(stray, spos);
+    ceph_assert(r == 0);
+  }
+}
+
+// Tear the mounted store down: flush and sync outstanding work, drop the
+// collection handles, stop the sync thread, write-back throttle, op thread
+// pool, journal and finishers, close the descriptors held by the store,
+// release the backend and object map, and shut the timer down.
+// Always returns 0.
+int FileStore::umount()
+{
+  dout(5) << __FUNC__ << ": " << basedir << dendl;
+
+  flush();
+  sync();
+  do_force_sync();
+
+  {
+    Mutex::Locker l(coll_lock);
+    coll_map.clear();
+  }
+
+  // tell the sync thread to stop, then wait for it to exit
+  lock.Lock();
+  stop = true;
+  sync_cond.Signal();
+  lock.Unlock();
+  sync_thread.join();
+
+  if (!m_disable_wbthrottle)
+    wbthrottle.stop();
+  op_tp.stop();
+
+  journal_stop();
+  if (!(generic_flags & SKIP_JOURNAL_REPLAY))
+    journal_write_close();
+
+  for (Finisher *f : ondisk_finishers)
+    f->stop();
+  for (Finisher *f : apply_finishers)
+    f->stop();
+
+  // close a descriptor if it is open and mark it as closed
+  auto close_if_open = [](int& fd) {
+    if (fd < 0)
+      return;
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    fd = -1;
+  };
+  close_if_open(vdo_fd);
+  close_if_open(fsid_fd);
+  close_if_open(op_fd);
+  close_if_open(current_fd);
+  close_if_open(basedir_fd);
+
+  force_sync = false;
+
+  delete backend;
+  backend = nullptr;
+
+  object_map.reset();
+
+  {
+    Mutex::Locker l(sync_entry_timeo_lock);
+    timer.shutdown();
+  }
+
+  return 0;
+}
+
+
+/// -----------------------------
+
+// keep OpSequencer handles alive for all time so that a sequence
+// that removes a collection and creates a new one will not allow
+// two sequencers for the same collection to be alive at once.
+
+// Look up the handle registered for collection c in coll_map (under
+// coll_lock).  Returns a default-constructed CollectionHandle when the
+// collection has no entry.
+ObjectStore::CollectionHandle FileStore::open_collection(const coll_t& c)
+{
+  Mutex::Locker l(coll_lock);
+  auto found = coll_map.find(c);
+  if (found != coll_map.end()) {
+    return found->second;
+  }
+  return CollectionHandle();
+}
+
+// Return the handle for collection c, registering a fresh OpSequencer in
+// coll_map (under coll_lock) if none exists yet.  An existing entry is
+// returned unchanged.
+ObjectStore::CollectionHandle FileStore::create_new_collection(const coll_t& c)
+{
+  Mutex::Locker l(coll_lock);
+  auto found = coll_map.find(c);
+  if (found != coll_map.end()) {
+    return found->second;
+  }
+
+  auto *seq = new OpSequencer(cct, ++next_osr_id, c);
+  coll_map[c] = seq;
+  return seq;
+}
+
+
+/// -----------------------------
+
+// Allocate an Op describing the given transactions.
+//
+// The byte and op counts are summed over tls before the vector is moved
+// into the new Op, so tls is left moved-from on return.  The caller owns
+// the returned Op.
+FileStore::Op *FileStore::build_op(vector<Transaction>& tls,
+				   Context *onreadable,
+				   Context *onreadable_sync,
+				   TrackedOpRef osd_op)
+{
+  uint64_t total_bytes = 0;
+  uint64_t total_ops = 0;
+  for (Transaction& t : tls) {
+    total_bytes += t.get_num_bytes();
+    total_ops += t.get_num_ops();
+  }
+
+  Op *result = new Op;
+  result->start = ceph_clock_now();
+  result->tls = std::move(tls);
+  result->onreadable = onreadable;
+  result->onreadable_sync = onreadable_sync;
+  result->ops = total_ops;
+  result->bytes = total_bytes;
+  result->osd_op = osd_op;
+  return result;
+}
+
+
+
+// Hand an op to the apply path.
+//
+// The op is appended to its sequencer first and only then is the
+// sequencer queued on the op work queue, so that whichever thread picks
+// the sequencer up, ops are applied in the order they were queued.
+void FileStore::queue_op(OpSequencer *osr, Op *o)
+{
+  osr->queue(o);
+  o->trace.event("queued");
+
+  // account for the newly queued op
+  logger->inc(l_filestore_ops);
+  logger->inc(l_filestore_bytes, o->bytes);
+
+  dout(5) << __FUNC__ << ": " << o << " seq " << o->op
+	  << " " << *osr
+	  << " " << o->bytes << " bytes"
+	  << "   (queue has " << throttle_ops.get_current()
+	  << " ops and " << throttle_bytes.get_current() << " bytes)"
+	  << dendl;
+
+  op_wq.queue(osr);
+}
+
+// Take one unit from throttle_ops and o->bytes from throttle_bytes for
+// op o, then publish the current throttle levels to the perf counters.
+void FileStore::op_queue_reserve_throttle(Op *o)
+{
+  throttle_ops.get();
+  throttle_bytes.get(o->bytes);
+
+  const auto queued_ops = throttle_ops.get_current();
+  logger->set(l_filestore_op_queue_ops, queued_ops);
+  const auto queued_bytes = throttle_bytes.get_current();
+  logger->set(l_filestore_op_queue_bytes, queued_bytes);
+}
+
+// Give back the throttle_ops unit and the o->bytes of throttle_bytes held
+// for op o, then publish the current throttle levels to the perf counters.
+void FileStore::op_queue_release_throttle(Op *o)
+{
+  throttle_ops.put();
+  throttle_bytes.put(o->bytes);
+
+  const auto queued_ops = throttle_ops.get_current();
+  logger->set(l_filestore_op_queue_ops, queued_ops);
+  const auto queued_bytes = throttle_bytes.get_current();
+  logger->set(l_filestore_op_queue_bytes, queued_bytes);
+}
+
+// Apply the op at the head of osr's queue.
+//
+// Takes osr->apply_lock and returns with it still held; _finish_op()
+// releases it.  The op is only peeked at here, not dequeued.
+void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
+{
+  if (!m_disable_wbthrottle) {
+    wbthrottle.throttle();
+  }
+
+  // debugging aid: stall once for the configured number of seconds, then
+  // reset the option so later ops are not delayed
+  if (int stall_secs = cct->_conf->filestore_inject_stall) {
+    dout(5) << __FUNC__ << ": filestore_inject_stall " << stall_secs << ", sleeping" << dendl;
+    sleep(stall_secs);
+    cct->_conf.set_val("filestore_inject_stall", "0");
+    dout(5) << __FUNC__ << ": done stalling" << dendl;
+  }
+
+  osr->apply_lock.Lock();
+  Op *cur = osr->peek_queue();
+
+  cur->trace.event("op_apply_start");
+  apply_manager.op_apply_start(cur->op);
+  dout(5) << __FUNC__ << ": " << cur << " seq " << cur->op << " " << *osr << " start" << dendl;
+
+  cur->trace.event("_do_transactions start");
+  int r = _do_transactions(cur->tls, cur->op, &handle, osr->osr_name);
+
+  cur->trace.event("op_apply_finish");
+  apply_manager.op_apply_finish(cur->op);
+  dout(10) << __FUNC__ << ": " << cur << " seq " << cur->op << " r = " << r
+	   << ", finisher " << cur->onreadable << " " << cur->onreadable_sync << dendl;
+}
+
+// Complete the op that _do_op() applied on this sequencer.
+//
+// Dequeues the op, releases osr->apply_lock (taken in _do_op), returns the
+// op's throttle reservation, records the apply latency, completes
+// onreadable_sync inline, hands onreadable and any contexts returned by
+// dequeue() to an apply finisher, and finally frees the op.
+void FileStore::_finish_op(OpSequencer *osr)
+{
+  list<Context*> deferred;
+  Op *done = osr->dequeue(&deferred);
+
+  done->tls.clear();
+
+  const utime_t lat = ceph_clock_now() - done->start;
+
+  dout(10) << __FUNC__ << ": " << done << " seq " << done->op << " " << *osr << " lat " << lat << dendl;
+  osr->apply_lock.Unlock();  // locked in _do_op
+  done->trace.event("_finish_op");
+
+  // called with tp lock held
+  op_queue_release_throttle(done);
+
+  logger->tinc(l_filestore_apply_latency, lat);
+
+  // the finisher is chosen by sequencer id; only looked up when needed
+  auto finisher = [this, osr]() {
+    return apply_finishers[osr->id % m_apply_finisher_num];
+  };
+
+  if (done->onreadable_sync)
+    done->onreadable_sync->complete(0);
+  if (done->onreadable)
+    finisher()->queue(done->onreadable);
+  if (!deferred.empty())
+    finisher()->queue(deferred);
+
+  delete done;
+}
+
+// Completion context used in writeahead journal mode: when finished it
+// forwards the stored sequencer, op and ondisk context to
+// FileStore::_journaled_ahead().  The result code is ignored.
+struct C_JournaledAhead : public Context {
+  FileStore *fs;
+  FileStore::OpSequencer *osr;
+  FileStore::Op *o;
+  Context *ondisk;
+
+  C_JournaledAhead(FileStore *store,
+		   FileStore::OpSequencer *sequencer,
+		   FileStore::Op *op,
+		   Context *on_disk)
+    : fs(store), osr(sequencer), o(op), ondisk(on_disk)
+  {}
+
+  void finish(int) override {
+    fs->_journaled_ahead(osr, o, ondisk);
+  }
+};
+
+// Submit a batch of transactions on the collection handle ch.
+//
+// The completion contexts carried by the transactions are first collected
+// into onreadable / ondisk / onreadable_sync.  What happens next depends
+// on the journal configuration:
+//   - journal writeable and not in trailing mode: build an Op, encode the
+//     journal entry, reserve throttles, then either journal and queue the
+//     op together (parallel) or journal first and queue the op from the
+//     journal completion (writeahead);
+//   - no journal: build an Op, queue it, and register ondisk as a waiter
+//     on apply_manager;
+//   - otherwise (trailing): apply the transactions synchronously here and
+//     journal them afterwards.
+//
+// handle, when non-null, has its thread-pool timeout suspended while the
+// throttles are being reserved.
+//
+// Returns 0 on the blackhole, journaled and no-journal paths; on the
+// trailing path returns the result of do_transactions().
+int FileStore::queue_transactions(CollectionHandle& ch, vector<Transaction>& tls,
+				  TrackedOpRef osd_op,
+				  ThreadPool::TPHandle *handle)
+{
+  Context *onreadable;
+  Context *ondisk;
+  Context *onreadable_sync;
+  ObjectStore::Transaction::collect_contexts(
+    tls, &onreadable, &ondisk, &onreadable_sync);
+
+  // debugging option: silently drop the transaction, discarding its
+  // completion contexts without completing them
+  if (cct->_conf->objectstore_blackhole) {
+    dout(0) << __FUNC__ << ": objectstore_blackhole = TRUE, dropping transaction"
+	    << dendl;
+    delete ondisk;
+    ondisk = nullptr;
+    delete onreadable;
+    onreadable = nullptr;
+    delete onreadable_sync;
+    onreadable_sync = nullptr;
+    return 0;
+  }
+
+  utime_t start = ceph_clock_now();
+
+  // the handle is expected to wrap an OpSequencer; osr is dereferenced
+  // below without a null check
+  OpSequencer *osr = static_cast<OpSequencer*>(ch.get());
+  dout(5) << __FUNC__ << ": osr " << osr << " " << *osr << dendl;
+
+  ZTracer::Trace trace;
+  if (osd_op && osd_op->pg_trace) {
+    osd_op->store_trace.init("filestore op", &trace_endpoint, &osd_op->pg_trace);
+    trace = osd_op->store_trace;
+  }
+
+  // --- parallel / writeahead journal ---
+  if (journal && journal->is_writeable() && !m_filestore_journal_trailing) {
+    Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+
+    //prepare and encode transactions data out of lock
+    bufferlist tbl;
+    int orig_len = journal->prepare_entry(o->tls, &tbl);
+
+    // reserving throttles is bracketed by suspend/reset of the tp timeout
+    if (handle)
+      handle->suspend_tp_timeout();
+
+    op_queue_reserve_throttle(o);
+    journal->reserve_throttle_and_backoff(tbl.length());
+
+    if (handle)
+      handle->reset_tp_timeout();
+
+    uint64_t op_num = submit_manager.op_submit_start();
+    o->op = op_num;
+    trace.keyval("opnum", op_num);
+
+    if (m_filestore_do_dump)
+      dump_transactions(o->tls, o->op, osr);
+
+    if (m_filestore_journal_parallel) {
+      dout(5) << __FUNC__ << ": (parallel) " << o->op << " " << o->tls << dendl;
+
+      trace.keyval("journal mode", "parallel");
+      trace.event("journal started");
+      _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);
+
+      // queue inside submit_manager op submission lock
+      queue_op(osr, o);
+      trace.event("op queued");
+    } else if (m_filestore_journal_writeahead) {
+      dout(5) << __FUNC__ << ": (writeahead) " << o->op << " " << o->tls << dendl;
+
+      osr->queue_journal(o);
+
+      trace.keyval("journal mode", "writeahead");
+      trace.event("journal started");
+      // the op is queued for apply from C_JournaledAhead once the journal
+      // write completes
+      _op_journal_transactions(tbl, orig_len, o->op,
+			       new C_JournaledAhead(this, osr, o, ondisk),
+			       osd_op);
+    } else {
+      ceph_abort();
+    }
+    submit_manager.op_submit_finish(op_num);
+    utime_t end = ceph_clock_now();
+    logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+    return 0;
+  }
+
+  // --- no journal at all ---
+  if (!journal) {
+    Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+    dout(5) << __FUNC__ << ": (no journal) " << o << " " << tls << dendl;
+
+    if (handle)
+      handle->suspend_tp_timeout();
+
+    op_queue_reserve_throttle(o);
+
+    if (handle)
+      handle->reset_tp_timeout();
+
+    uint64_t op_num = submit_manager.op_submit_start();
+    o->op = op_num;
+
+    if (m_filestore_do_dump)
+      dump_transactions(o->tls, o->op, osr);
+
+    queue_op(osr, o);
+    trace.keyval("opnum", op_num);
+    trace.keyval("journal mode", "none");
+    trace.event("op queued");
+
+    // with no journal, ondisk is registered as a waiter on apply_manager
+    // for this op number
+    if (ondisk)
+      apply_manager.add_waiter(op_num, ondisk);
+    submit_manager.op_submit_finish(op_num);
+    utime_t end = ceph_clock_now();
+    logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+    return 0;
+  }
+
+  // --- trailing journal (or journal present but not writeable) ---
+  ceph_assert(journal);
+  //prepare and encode transactions data out of lock
+  bufferlist tbl;
+  // stays -1 when the journal is not writeable and no entry was prepared
+  int orig_len = -1;
+  if (journal->is_writeable()) {
+    orig_len = journal->prepare_entry(tls, &tbl);
+  }
+  uint64_t op = submit_manager.op_submit_start();
+  dout(5) << __FUNC__ << ": (trailing journal) " << op << " " << tls << dendl;
+
+  if (m_filestore_do_dump)
+    dump_transactions(tls, op, osr);
+
+  trace.event("op_apply_start");
+  trace.keyval("opnum", op);
+  trace.keyval("journal mode", "trailing");
+  apply_manager.op_apply_start(op);
+  trace.event("do_transactions");
+  int r = do_transactions(tls, op);
+
+  // journal only what was applied successfully; on failure ondisk is
+  // discarded without being completed
+  if (r >= 0) {
+    trace.event("journal started");
+    _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op);
+  } else {
+    delete ondisk;
+    ondisk = nullptr;
+  }
+
+  // start on_readable finisher after we queue journal item, as on_readable callback
+  // is allowed to delete the Transaction
+  if (onreadable_sync) {
+    onreadable_sync->complete(r);
+  }
+  // NOTE(review): onreadable is queued here without a null check, unlike
+  // _finish_op() which tests o->onreadable first -- confirm that
+  // collect_contexts() cannot leave it null or that Finisher::queue()
+  // tolerates a null context.
+  apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r);
+
+  submit_manager.op_submit_finish(op);
+  trace.event("op_apply_finish");
+  apply_manager.op_apply_finish(op);
+
+  utime_t end = ceph_clock_now();
+  logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+  return r;
+}
+
+// Writeahead-mode journal completion for op o: queue the op for apply,
+// pop it from the sequencer's journal queue, and pass ondisk (if any)
+// plus any contexts returned by dequeue_journal() to an ondisk finisher.
+void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
+{
+  dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl;
+
+  o->trace.event("writeahead journal finished");
+
+  // this should queue in order because the journal does its completions in order.
+  queue_op(osr, o);
+
+  list<Context*> pending;
+  osr->dequeue_journal(&pending);
+
+  // the finisher is chosen by sequencer id; only looked up when needed
+  auto finisher = [this, osr]() {
+    return ondisk_finishers[osr->id % m_ondisk_finisher_num];
+  };
+
+  // do ondisk completions async, to prevent any onreadable_sync completions
+  // getting blocked behind an ondisk completion.
+  if (ondisk) {
+    dout(10) << " queueing ondisk " << ondisk << dendl;
+    finisher()->queue(ondisk);
+  }
+  if (!pending.empty())
+    finisher()->queue(pending);
+}
+
+// Apply each transaction in tls in order via _do_transaction(), passing
+// its zero-based position within the batch.  When handle is non-null the
+// thread-pool timeout is reset after every transaction.
+// Always returns 0.
+int FileStore::_do_transactions(
+  vector<Transaction> &tls,
+  uint64_t op_seq,
+  ThreadPool::TPHandle *handle,
+  const char *osr_name)
+{
+  int position = 0;
+  for (Transaction& t : tls) {
+    _do_transaction(t, op_seq, position, handle, osr_name);
+    if (handle)
+      handle->reset_tp_timeout();
+    ++position;
+  }
+  return 0;
+}
+
+// Record a collection-wide replay guard at position spos on the directory
+// of collection cid.
+//
+// Does nothing when the backend can checkpoint.  Otherwise the object map
+// and the filesystem are synced first, then spos is encoded into the
+// GLOBAL_REPLAY_GUARD_XATTR xattr of the collection directory and the
+// directory is fsynced so the xattr is durable.  Any failure aborts.
+void FileStore::_set_global_replay_guard(const coll_t& cid,
+					 const SequencerPosition &spos)
+{
+  if (backend->can_checkpoint())
+    return;
+
+  // sync all previous operations on this sequencer
+  int ret = object_map->sync();
+  if (ret < 0) {
+    derr << __FUNC__ << ": omap sync error " << cpp_strerror(ret) << dendl;
+    ceph_abort_msg("_set_global_replay_guard failed");
+  }
+  ret = sync_filesystem(basedir_fd);
+  if (ret < 0) {
+    derr << __FUNC__ << ": sync_filesystem error " << cpp_strerror(ret) << dendl;
+    ceph_abort_msg("_set_global_replay_guard failed");
+  }
+
+  char fn[PATH_MAX];
+  get_cdir(cid, fn, sizeof(fn));
+  int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+  if (fd < 0) {
+    int err = errno;
+    derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+    ceph_abort_msg("_set_global_replay_guard failed");
+  }
+
+  _inject_failure();
+
+  // then record that we did it
+  bufferlist v;
+  encode(spos, v);
+  int r = chain_fsetxattr<true, true>(
+    fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length());
+  if (r < 0) {
+    derr << __FUNC__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR
+	 << " got " << cpp_strerror(r) << dendl;
+    ceph_abort_msg("fsetxattr failed");
+  }
+
+  // and make sure our xattr is durable.
+  r = ::fsync(fd);
+  if (r < 0) {
+    // capture errno before the logging code runs and possibly changes it
+    int err = errno;
+    derr << __func__ << " fsync failed: " << cpp_strerror(err) << dendl;
+    ceph_abort();
+  }
+
+  _inject_failure();
+
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+// Compare spos against the collection-wide replay guard stored on the
+// directory of collection cid.
+//
+// Returns 1 (replay allowed) when the directory cannot be opened, when the
+// guard xattr cannot be read, or when spos >= the recorded position;
+// returns -1 otherwise.  An -EIO from the xattr read triggers handle_eio()
+// when m_filestore_fail_eio is set.
+int FileStore::_check_global_replay_guard(const coll_t& cid,
+					  const SequencerPosition& spos)
+{
+  char cdir[PATH_MAX];
+  get_cdir(cid, cdir, sizeof(cdir));
+  const int cfd = ::open(cdir, O_RDONLY|O_CLOEXEC);
+  if (cfd < 0) {
+    dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
+    return 1;  // if collection does not exist, there is no guard, and we can replay.
+  }
+
+  char raw[100];
+  const int len = chain_fgetxattr(cfd, GLOBAL_REPLAY_GUARD_XATTR, raw, sizeof(raw));
+  if (len < 0) {
+    dout(20) << __FUNC__ << ": no xattr" << dendl;
+    if (len == -EIO && m_filestore_fail_eio)
+      handle_eio();
+    VOID_TEMP_FAILURE_RETRY(::close(cfd));
+    return 1;  // no xattr
+  }
+
+  bufferlist encoded;
+  encoded.append(raw, len);
+
+  SequencerPosition recorded;
+  auto it = encoded.cbegin();
+  decode(recorded, it);
+
+  VOID_TEMP_FAILURE_RETRY(::close(cfd));
+
+  if (spos >= recorded)
+    return 1;
+  return -1;
+}
+
+
+// Set the replay guard on the directory of collection cid: open the
+// directory, delegate to the fd-based overload with no object id, and
+// close the descriptor again.  Aborts if the directory cannot be opened.
+void FileStore::_set_replay_guard(const coll_t& cid,
+                                  const SequencerPosition &spos,
+                                  bool in_progress=false)
+{
+  char cdir[PATH_MAX];
+  get_cdir(cid, cdir, sizeof(cdir));
+
+  const int cfd = ::open(cdir, O_RDONLY|O_CLOEXEC);
+  if (cfd < 0) {
+    const int err = errno;
+    derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+    ceph_abort_msg("_set_replay_guard failed");
+  }
+
+  _set_replay_guard(cfd, spos, nullptr, in_progress);
+  VOID_TEMP_FAILURE_RETRY(::close(cfd));
+}
+
+
+// Record a replay guard at position spos on the file or directory behind
+// fd.
+//
+// Does nothing when the backend can checkpoint.  Otherwise: fsync fd so
+// the preceding operation is committed, sync the object map (skipped when
+// in_progress is set), store the encoded (spos, in_progress) pair in the
+// REPLAY_GUARD_XATTR xattr, and fsync again so the xattr is durable.
+// fsync and fsetxattr failures abort.
+//
+// hoid is passed through to object_map->sync(); callers in this file pass
+// a null pointer for collection-level guards.
+void FileStore::_set_replay_guard(int fd,
+				  const SequencerPosition& spos,
+				  const ghobject_t *hoid,
+				  bool in_progress)
+{
+  if (backend->can_checkpoint())
+    return;
+
+  dout(10) << __FUNC__ << ": " << spos << (in_progress ? " START" : "") << dendl;
+
+  _inject_failure();
+
+  // first make sure the previous operation commits
+  int r = ::fsync(fd);
+  if (r < 0) {
+    // capture errno before the logging code runs and possibly changes it
+    int err = errno;
+    derr << __func__ << " fsync failed: " << cpp_strerror(err) << dendl;
+    ceph_abort();
+  }
+
+  if (!in_progress) {
+    // sync object_map too.  even if this object has no header or keys
+    // now, it may have had them in the past and then removed them, so
+    // always sync.
+    // NOTE(review): the return value of sync() is not checked here,
+    // unlike in _set_global_replay_guard() -- confirm this is intended.
+    object_map->sync(hoid, &spos);
+  }
+
+  _inject_failure();
+
+  // then record that we did it
+  bufferlist v(40);
+  encode(spos, v);
+  encode(in_progress, v);
+  r = chain_fsetxattr<true, true>(
+    fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+  if (r < 0) {
+    derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+    ceph_abort_msg("fsetxattr failed");
+  }
+
+  // and make sure our xattr is durable.
+  r = ::fsync(fd);
+  if (r < 0) {
+    // capture errno before the logging code runs and possibly changes it
+    int err = errno;
+    derr << __func__ << " fsync failed: " << cpp_strerror(err) << dendl;
+    ceph_abort();
+  }
+
+  _inject_failure();
+
+  dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+// Close the replay guard on the directory of collection cid: open the
+// directory, delegate to the fd-based overload, and close the descriptor
+// again.  Aborts if the directory cannot be opened.
+void FileStore::_close_replay_guard(const coll_t& cid,
+                                    const SequencerPosition &spos)
+{
+  char cdir[PATH_MAX];
+  get_cdir(cid, cdir, sizeof(cdir));
+
+  const int cfd = ::open(cdir, O_RDONLY|O_CLOEXEC);
+  if (cfd < 0) {
+    const int err = errno;
+    derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+    ceph_abort_msg("_close_replay_guard failed");
+  }
+
+  _close_replay_guard(cfd, spos);
+  VOID_TEMP_FAILURE_RETRY(::close(cfd));
+}
+
+// Mark the guarded operation at position spos as finished on the file or
+// directory behind fd.
+//
+// Does nothing when the backend can checkpoint.  Otherwise the object map
+// is synced, the REPLAY_GUARD_XATTR xattr is rewritten with
+// (spos, in_progress=false), and fd is fsynced so the xattr is durable.
+// fsetxattr and fsync failures abort.
+void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos,
+				    const ghobject_t *hoid)
+{
+  if (backend->can_checkpoint())
+    return;
+
+  dout(10) << __FUNC__ << ": " << spos << dendl;
+
+  _inject_failure();
+
+  // sync object_map too.  even if this object has no header or keys now,
+  // it may have had them in the past and then removed them, so always
+  // sync.
+  // NOTE(review): the return value of sync() is not checked here, unlike
+  // in _set_global_replay_guard() -- confirm this is intended.
+  object_map->sync(hoid, &spos);
+
+  // then record that we are done with this operation
+  bufferlist v(40);
+  encode(spos, v);
+  bool in_progress = false;
+  encode(in_progress, v);
+  int r = chain_fsetxattr<true, true>(
+    fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+  if (r < 0) {
+    derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+    ceph_abort_msg("fsetxattr failed");
+  }
+
+  // and make sure our xattr is durable.
+  r = ::fsync(fd);
+  if (r < 0) {
+    // capture errno before the logging code runs and possibly changes it
+    int err = errno;
+    derr << __func__ << " fsync failed: " << cpp_strerror(err) << dendl;
+    ceph_abort();
+  }
+
+  _inject_failure();
+
+  dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+// Decide whether the operation at spos may be replayed against object oid
+// in collection cid.
+//
+// Returns 1 straight away when not replaying or when the backend can
+// checkpoint.  Otherwise the collection-wide guard is consulted first (a
+// negative answer is returned as is), then the object's own guard.  An
+// object that cannot be opened has no guard, so 1 is returned.
+int FileStore::_check_replay_guard(const coll_t& cid, const ghobject_t &oid,
+                                   const SequencerPosition& spos)
+{
+  if (!replaying || backend->can_checkpoint())
+    return 1;
+
+  const int global_verdict = _check_global_replay_guard(cid, spos);
+  if (global_verdict < 0)
+    return global_verdict;
+
+  FDRef fd;
+  if (lfn_open(cid, oid, false, &fd) < 0) {
+    dout(10) << __FUNC__ << ": " << cid << " " << oid << " dne" << dendl;
+    return 1;  // if file does not exist, there is no guard, and we can replay.
+  }
+
+  const int verdict = _check_replay_guard(**fd, spos);
+  lfn_close(fd);
+  return verdict;
+}
+
+// Decide whether the operation at spos may be replayed against collection
+// cid, based on the guard stored on the collection directory.
+//
+// Returns 1 straight away when not replaying, when the backend can
+// checkpoint, or when the directory cannot be opened; otherwise returns
+// the verdict of the fd-based overload.
+int FileStore::_check_replay_guard(const coll_t& cid, const SequencerPosition& spos)
+{
+  if (!replaying || backend->can_checkpoint())
+    return 1;
+
+  char cdir[PATH_MAX];
+  get_cdir(cid, cdir, sizeof(cdir));
+
+  const int cfd = ::open(cdir, O_RDONLY|O_CLOEXEC);
+  if (cfd < 0) {
+    dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
+    return 1;  // if collection does not exist, there is no guard, and we can replay.
+  }
+
+  const int verdict = _check_replay_guard(cfd, spos);
+  VOID_TEMP_FAILURE_RETRY(::close(cfd));
+  return verdict;
+}
+
+// Compare spos against the replay guard stored in the REPLAY_GUARD_XATTR
+// xattr of fd.
+//
+// Returns:
+//    1  replay the operation (not replaying, backend can checkpoint, no
+//       readable guard, or the guard position is older than spos)
+//    0  conditional replay (guard position equals spos and the guard is
+//       marked in_progress)
+//   -1  skip replay (guard position is newer than spos, or equal and not
+//       in progress)
+// An -EIO from the xattr read triggers handle_eio() when
+// m_filestore_fail_eio is set.
+int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
+{
+  if (!replaying || backend->can_checkpoint())
+    return 1;
+
+  char raw[100];
+  const int len = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, raw, sizeof(raw));
+  if (len < 0) {
+    dout(20) << __FUNC__ << ": no xattr" << dendl;
+    if (len == -EIO && m_filestore_fail_eio)
+      handle_eio();
+    return 1;  // no xattr
+  }
+
+  bufferlist encoded;
+  encoded.append(raw, len);
+
+  SequencerPosition opos;
+  auto it = encoded.cbegin();
+  decode(opos, it);
+  bool in_progress = false;
+  if (!it.end())   // older journals don't have this
+    decode(in_progress, it);
+
+  if (opos > spos) {
+    dout(10) << __FUNC__ << ": object has " << opos << " > current pos " << spos
+	     << ", now or in future, SKIPPING REPLAY" << dendl;
+    return -1;
+  }
+
+  if (!(opos == spos)) {
+    dout(10) << __FUNC__ << ": object has " << opos << " < current pos " << spos
+	     << ", in past, will replay" << dendl;
+    return 1;
+  }
+
+  // guard position matches the current position exactly
+  if (in_progress) {
+    dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
+	     << ", in_progress=true, CONDITIONAL REPLAY" << dendl;
+    return 0;
+  }
+
+  dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
+	   << ", in_progress=false, SKIPPING REPLAY" << dendl;
+  return -1;
+}
+
+// Apply every op contained in transaction `t` to the store.
+//
+//   t          transaction whose ops are decoded and executed in order
+//   op_seq     journal sequence number; with trans_num and the running op
+//              index it forms the SequencerPosition handed to helpers and
+//              compared against replay guards
+//   trans_num  index of this transaction within op_seq
+//   handle     optional thread-pool handle; its timeout is reset before
+//              every op
+//   osr_name   sequencer name, only passed to tracepoints
+//
+// Ops that consult a replay guard are executed only when the guard check
+// returns > 0.  A negative result from an op is either tolerated (see the
+// rules after the switch) or dumps the transaction and aborts the process.
+void FileStore::_do_transaction(
+  Transaction& t, uint64_t op_seq, int trans_num,
+  ThreadPool::TPHandle *handle,
+  const char *osr_name)
+{
+  dout(10) << __FUNC__ << ": on " << &t << dendl;
+
+  Transaction::iterator i = t.begin();
+
+  // spos.op is advanced once per decoded op at the bottom of the loop.
+  SequencerPosition spos(op_seq, trans_num, 0);
+  while (i.have_op()) {
+    if (handle)
+      handle->reset_tp_timeout();
+
+    Transaction::Op *op = i.decode_op();
+    int r = 0;
+
+    _inject_failure();
+
+    // In most cases below, objects for which _need_temp_object_collection()
+    // is true are redirected to the collection's temp sibling (get_temp()).
+    switch (op->op) {
+    case Transaction::OP_NOP:
+      break;
+    case Transaction::OP_TOUCH:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, touch_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _touch(cid, oid);
+        tracepoint(objectstore, touch_exit, r);
+      }
+      break;
+
+    case Transaction::OP_WRITE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+        uint32_t fadvise_flags = i.get_fadvise_flags();
+        bufferlist bl;
+        i.decode_bl(bl);
+        tracepoint(objectstore, write_enter, osr_name, off, len);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _write(cid, oid, off, len, bl, fadvise_flags);
+        tracepoint(objectstore, write_exit, r);
+      }
+      break;
+
+    case Transaction::OP_ZERO:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+        tracepoint(objectstore, zero_enter, osr_name, off, len);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _zero(cid, oid, off, len);
+        tracepoint(objectstore, zero_exit, r);
+      }
+      break;
+
+    case Transaction::OP_TRIMCACHE:
+      {
+	// deprecated, no-op
+      }
+      break;
+
+    case Transaction::OP_TRUNCATE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        uint64_t off = op->off;
+        tracepoint(objectstore, truncate_enter, osr_name, off);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _truncate(cid, oid, off);
+        tracepoint(objectstore, truncate_exit, r);
+      }
+      break;
+
+    case Transaction::OP_REMOVE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, remove_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _remove(cid, oid, spos);
+        tracepoint(objectstore, remove_exit, r);
+      }
+      break;
+
+    case Transaction::OP_SETATTR:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        string name = i.decode_string();
+        bufferlist bl;
+        i.decode_bl(bl);
+        tracepoint(objectstore, setattr_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0) {
+          // A single attribute is applied through the multi-attribute path.
+          map<string, bufferptr> to_set;
+          to_set[name] = bufferptr(bl.c_str(), bl.length());
+          r = _setattrs(cid, oid, to_set, spos);
+          if (r == -ENOSPC)
+            dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid
+                    << " name " << name << " size " << bl.length() << dendl;
+        }
+        tracepoint(objectstore, setattr_exit, r);
+      }
+      break;
+
+    case Transaction::OP_SETATTRS:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        map<string, bufferptr> aset;
+        i.decode_attrset(aset);
+        tracepoint(objectstore, setattrs_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _setattrs(cid, oid, aset, spos);
+        tracepoint(objectstore, setattrs_exit, r);
+        if (r == -ENOSPC)
+          dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl;
+      }
+      break;
+
+    case Transaction::OP_RMATTR:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        string name = i.decode_string();
+        tracepoint(objectstore, rmattr_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _rmattr(cid, oid, name.c_str(), spos);
+        tracepoint(objectstore, rmattr_exit, r);
+      }
+      break;
+
+    case Transaction::OP_RMATTRS:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, rmattrs_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _rmattrs(cid, oid, spos);
+        tracepoint(objectstore, rmattrs_exit, r);
+      }
+      break;
+
+    // The clone ops do not consult a replay guard here; spos is passed down
+    // to the helper instead.
+    case Transaction::OP_CLONE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        const ghobject_t &noid = i.get_oid(op->dest_oid);
+        tracepoint(objectstore, clone_enter, osr_name);
+        r = _clone(cid, oid, noid, spos);
+        tracepoint(objectstore, clone_exit, r);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const ghobject_t &noid = i.get_oid(op->dest_oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ?
+          _cid : _cid.get_temp();
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+        tracepoint(objectstore, clone_range_enter, osr_name, len);
+        // Source and destination offsets are the same for this op.
+        r = _clone_range(cid, oid, ncid, noid, off, len, off, spos);
+        tracepoint(objectstore, clone_range_exit, r);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE2:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const ghobject_t &noid = i.get_oid(op->dest_oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ?
+          _cid : _cid.get_temp();
+        uint64_t srcoff = op->off;
+        uint64_t len = op->len;
+        uint64_t dstoff = op->dest_off;
+        tracepoint(objectstore, clone_range2_enter, osr_name, len);
+        r = _clone_range(cid, oid, ncid, noid, srcoff, len, dstoff, spos);
+        tracepoint(objectstore, clone_range2_exit, r);
+      }
+      break;
+
+    case Transaction::OP_MKCOLL:
+      {
+        const coll_t &cid = i.get_cid(op->cid);
+        tracepoint(objectstore, mkcoll_enter, osr_name);
+        if (_check_replay_guard(cid, spos) > 0)
+          r = _create_collection(cid, op->split_bits, spos);
+        tracepoint(objectstore, mkcoll_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_SET_BITS:
+      {
+	const coll_t &cid = i.get_cid(op->cid);
+	int bits = op->split_bits;
+	r = _collection_set_bits(cid, bits);
+      }
+      break;
+
+    case Transaction::OP_COLL_HINT:
+      {
+        const coll_t &cid = i.get_cid(op->cid);
+        uint32_t type = op->hint_type;
+        bufferlist hint;
+        i.decode_bl(hint);
+        auto hiter = hint.cbegin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          decode(pg_num, hiter);
+          decode(num_objs, hiter);
+          if (_check_replay_guard(cid, spos) > 0) {
+            r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos);
+          }
+        } else {
+          // Ignore the hint
+          dout(10) << "Unrecognized collection hint type: " << type << dendl;
+        }
+      }
+      break;
+
+    case Transaction::OP_RMCOLL:
+      {
+        const coll_t &cid = i.get_cid(op->cid);
+        tracepoint(objectstore, rmcoll_enter, osr_name);
+        if (_check_replay_guard(cid, spos) > 0)
+          r = _destroy_collection(cid);
+        tracepoint(objectstore, rmcoll_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_ADD:
+      {
+        const coll_t &ocid = i.get_cid(op->cid);
+        const coll_t &ncid = i.get_cid(op->dest_cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+
+	ceph_assert(oid.hobj.pool >= -1);
+
+        // always followed by OP_COLL_REMOVE
+        Transaction::Op *op2 = i.decode_op();
+        const coll_t &ocid2 = i.get_cid(op2->cid);
+        const ghobject_t &oid2 = i.get_oid(op2->oid);
+        ceph_assert(op2->op == Transaction::OP_COLL_REMOVE);
+        ceph_assert(ocid2 == ocid);
+        ceph_assert(oid2 == oid);
+
+        tracepoint(objectstore, coll_add_enter);
+        r = _collection_add(ncid, ocid, oid, spos);
+        tracepoint(objectstore, coll_add_exit, r);
+        // Two ops were decoded above, so account for the first one here; the
+        // increment at the bottom of the loop covers the paired remove.
+        spos.op++;
+        if (r < 0)
+          break;
+        tracepoint(objectstore, coll_remove_enter, osr_name);
+        if (_check_replay_guard(ocid, oid, spos) > 0)
+          r = _remove(ocid, oid, spos);
+        tracepoint(objectstore, coll_remove_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_MOVE:
+      {
+        // WARNING: this is deprecated and buggy; only here to replay old journals.
+        const coll_t &ocid = i.get_cid(op->cid);
+        const coll_t &ncid = i.get_cid(op->dest_cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        tracepoint(objectstore, coll_move_enter);
+        r = _collection_add(ocid, ncid, oid, spos);
+        if (r == 0 &&
+            (_check_replay_guard(ocid, oid, spos) > 0))
+          r = _remove(ocid, oid, spos);
+        tracepoint(objectstore, coll_move_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_MOVE_RENAME:
+      {
+        const coll_t &_oldcid = i.get_cid(op->cid);
+        const ghobject_t &oldoid = i.get_oid(op->oid);
+        const coll_t &_newcid = i.get_cid(op->dest_cid);
+        const ghobject_t &newoid = i.get_oid(op->dest_oid);
+        const coll_t &oldcid = !_need_temp_object_collection(_oldcid, oldoid) ?
+          _oldcid : _oldcid.get_temp();
+        // The destination resolves against its own collection: _newcid when
+        // no temp collection is needed, _newcid's temp sibling otherwise.
+        const coll_t &newcid = !_need_temp_object_collection(_newcid, newoid) ?
+          _newcid : _newcid.get_temp();
+        tracepoint(objectstore, coll_move_rename_enter);
+        r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
+        tracepoint(objectstore, coll_move_rename_exit, r);
+      }
+      break;
+
+    case Transaction::OP_TRY_RENAME:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oldoid = i.get_oid(op->oid);
+        const ghobject_t &newoid = i.get_oid(op->dest_oid);
+        const coll_t &oldcid = !_need_temp_object_collection(_cid, oldoid) ?
+          _cid : _cid.get_temp();
+        const coll_t &newcid = !_need_temp_object_collection(_cid, newoid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, coll_try_rename_enter);
+        r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos, true);
+        tracepoint(objectstore, coll_try_rename_exit, r);
+      }
+      break;
+
+    case Transaction::OP_COLL_SETATTR:
+    case Transaction::OP_COLL_RMATTR:
+      ceph_abort_msg("collection attr methods no longer implemented");
+      break;
+
+    case Transaction::OP_COLL_RENAME:
+      {
+        r = -EOPNOTSUPP;
+      }
+      break;
+
+    case Transaction::OP_OMAP_CLEAR:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        tracepoint(objectstore, omap_clear_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+	  r = _omap_clear(cid, oid, spos);
+        tracepoint(objectstore, omap_clear_exit, r);
+      }
+      break;
+    case Transaction::OP_OMAP_SETKEYS:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        map<string, bufferlist> aset;
+        i.decode_attrset(aset);
+        tracepoint(objectstore, omap_setkeys_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+	  r = _omap_setkeys(cid, oid, aset, spos);
+        tracepoint(objectstore, omap_setkeys_exit, r);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYS:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        set<string> keys;
+        i.decode_keyset(keys);
+        tracepoint(objectstore, omap_rmkeys_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+	  r = _omap_rmkeys(cid, oid, keys, spos);
+        tracepoint(objectstore, omap_rmkeys_exit, r);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYRANGE:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        string first, last;
+        first = i.decode_string();
+        last = i.decode_string();
+        tracepoint(objectstore, omap_rmkeyrange_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+	  r = _omap_rmkeyrange(cid, oid, first, last, spos);
+        tracepoint(objectstore, omap_rmkeyrange_exit, r);
+      }
+      break;
+    case Transaction::OP_OMAP_SETHEADER:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        bufferlist bl;
+        i.decode_bl(bl);
+        tracepoint(objectstore, omap_setheader_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+	  r = _omap_setheader(cid, oid, bl, spos);
+        tracepoint(objectstore, omap_setheader_exit, r);
+      }
+      break;
+    case Transaction::OP_SPLIT_COLLECTION:
+      {
+	ceph_abort_msg("not legacy journal; upgrade to firefly first");
+      }
+      break;
+    case Transaction::OP_SPLIT_COLLECTION2:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t bits = op->split_bits;
+        uint32_t rem = op->split_rem;
+        coll_t dest = i.get_cid(op->dest_cid);
+        tracepoint(objectstore, split_coll2_enter, osr_name);
+        r = _split_collection(cid, bits, rem, dest, spos);
+        tracepoint(objectstore, split_coll2_exit, r);
+      }
+      break;
+
+    case Transaction::OP_MERGE_COLLECTION:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t bits = op->split_bits;
+        coll_t dest = i.get_cid(op->dest_cid);
+        tracepoint(objectstore, merge_coll_enter, osr_name);
+        r = _merge_collection(cid, bits, dest, spos);
+        tracepoint(objectstore, merge_coll_exit, r);
+      }
+      break;
+
+    case Transaction::OP_SETALLOCHINT:
+      {
+        const coll_t &_cid = i.get_cid(op->cid);
+        const ghobject_t &oid = i.get_oid(op->oid);
+        const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+          _cid : _cid.get_temp();
+        uint64_t expected_object_size = op->expected_object_size;
+        uint64_t expected_write_size = op->expected_write_size;
+        tracepoint(objectstore, setallochint_enter, osr_name);
+        if (_check_replay_guard(cid, oid, spos) > 0)
+          r = _set_alloc_hint(cid, oid, expected_object_size,
+                              expected_write_size);
+        tracepoint(objectstore, setallochint_exit, r);
+      }
+      break;
+
+    default:
+      derr << "bad op " << op->op << dendl;
+      ceph_abort();
+    }
+
+    // Error triage: decide whether a negative result is tolerable.  Anything
+    // not explicitly tolerated dumps the transaction and aborts.
+    if (r < 0) {
+      bool ok = false;
+
+      // ENOENT is tolerated except for the ops listed here.
+      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+			    op->op == Transaction::OP_CLONE ||
+			    op->op == Transaction::OP_CLONERANGE2 ||
+			    op->op == Transaction::OP_COLL_ADD ||
+			    op->op == Transaction::OP_SETATTR ||
+			    op->op == Transaction::OP_SETATTRS ||
+			    op->op == Transaction::OP_RMATTR ||
+			    op->op == Transaction::OP_OMAP_SETKEYS ||
+			    op->op == Transaction::OP_OMAP_RMKEYS ||
+			    op->op == Transaction::OP_OMAP_RMKEYRANGE ||
+			    op->op == Transaction::OP_OMAP_SETHEADER))
+	// -ENOENT is normally okay
+	// ...including on a replayed OP_RMCOLL with checkpoint mode
+	ok = true;
+      if (r == -ENODATA)
+	ok = true;
+
+      if (op->op == Transaction::OP_SETALLOCHINT)
+        // Either EOPNOTSUPP or EINVAL most probably.  EINVAL in most
+        // cases means invalid hint size (e.g. too big, not a multiple
+        // of block size, etc) or, at least on xfs, an attempt to set
+        // or change it when the file is not empty.  However,
+        // OP_SETALLOCHINT is advisory, so ignore all errors.
+        ok = true;
+
+      // Extra tolerance while replaying without checkpoint support.
+      if (replaying && !backend->can_checkpoint()) {
+	if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) {
+	  dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+	  ok = true;
+	}
+	if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) {
+	  dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+	  ok = true;
+	}
+	if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) {
+	  dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+	  ok = true;
+	}
+	if (r == -ERANGE) {
+	  dout(10) << "tolerating ERANGE on replay" << dendl;
+	  ok = true;
+	}
+	if (r == -ENOENT) {
+	  dout(10) << "tolerating ENOENT on replay" << dendl;
+	  ok = true;
+	}
+      }
+
+      if (!ok) {
+	const char *msg = "unexpected error code";
+
+	if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+			     op->op == Transaction::OP_CLONE ||
+			     op->op == Transaction::OP_CLONERANGE2)) {
+	  msg = "ENOENT on clone suggests osd bug";
+	} else if (r == -ENOSPC) {
+	  // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+	  // by partially applying transactions.
+	  msg = "ENOSPC from disk filesystem, misconfigured cluster";
+	} else if (r == -ENOTEMPTY) {
+	  msg = "ENOTEMPTY suggests garbage data in osd data dir";
+	} else if (r == -EPERM) {
+          msg = "EPERM suggests file(s) in osd data dir not owned by ceph user, or leveldb corruption";
+        }
+
+	derr  << " error " << cpp_strerror(r) << " not handled on operation " << op
+	      << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl;
+	dout(0) << msg << dendl;
+	dout(0) << " transaction dump:\n";
+	JSONFormatter f(true);
+	f.open_object_section("transaction");
+	t.dump(&f);
+	f.close_section();
+	f.flush(*_dout);
+	*_dout << dendl;
+
+	if (r == -EMFILE) {
+	  dump_open_fds(cct);
+	}
+
+	ceph_abort_msg("unexpected error");
+      }
+    }
+
+    spos.op++;
+  }
+
+  _inject_failure();
+}
+
+  /*********************************************/
+
+
+
+// --------------------
+// objects
+
+// Report whether `oid` exists in the collection behind `ch`.
+// Waits for pending applies on the object, then returns true exactly when
+// stat() on it returns 0.
+bool FileStore::exists(CollectionHandle& ch, const ghobject_t& oid)
+{
+  tracepoint(objectstore, exists_enter, ch->cid.c_str());
+
+  OpSequencer *sequencer = static_cast<OpSequencer*>(ch.get());
+  sequencer->wait_for_apply(oid);
+
+  struct stat sb;
+  const int rc = stat(ch, oid, &sb);
+  const bool found = (rc == 0);
+
+  tracepoint(objectstore, exists_exit, found);
+  return found;
+}
+
+// Stat object `oid` in the collection behind `ch`, filling `*st`.
+//
+// Waits for pending applies on the object first.  Returns the lfn_stat()
+// result, or -EIO when metadata error injection is enabled and selects this
+// object.  Unless `allow_eio` is set, an -EIO from lfn_stat() trips an
+// assertion when m_filestore_fail_eio is on.
+int FileStore::stat(
+  CollectionHandle& ch, const ghobject_t& oid, struct stat *st, bool allow_eio)
+{
+  tracepoint(objectstore, stat_enter, ch->cid.c_str());
+
+  OpSequencer *sequencer = static_cast<OpSequencer*>(ch.get());
+  sequencer->wait_for_apply(oid);
+
+  // Objects that need the temp collection are looked up there instead.
+  const coll_t& cid = _need_temp_object_collection(ch->cid, oid) ? ch->cid.get_temp() : ch->cid;
+
+  const int r = lfn_stat(cid, oid, st);
+  ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
+
+  if (r >= 0) {
+    dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid
+	     << " = " << r
+	     << " (size " << st->st_size << ")" << dendl;
+  } else {
+    dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid
+	     << " = " << r << dendl;
+  }
+
+  // Debug hook: pretend the metadata read failed.
+  if (cct->_conf->filestore_debug_inject_read_err &&
+      debug_mdata_eio(oid)) {
+    return -EIO;
+  }
+
+  tracepoint(objectstore, stat_exit, r);
+  return r;
+}
+
+// Per-collection pool options are not implemented here: both arguments are
+// ignored and the call always returns -EOPNOTSUPP.
+int FileStore::set_collection_opts(
+  CollectionHandle& ch,
+  const pool_opts_t& opts)
+{
+  return -EOPNOTSUPP;
+}
+
+// Read up to `len` bytes at `offset` from object `oid` into `bl`.
+//
+//   ch        collection handle; also used to wait for pending applies
+//   oid       object to read
+//   offset    byte offset to start reading at
+//   len       number of bytes to read; offset == 0 && len == 0 means
+//             "read the whole file" (length taken from fstat)
+//   bl        output; cleared and replaced by the data that was read
+//   op_flags  CEPH_OSD_OP_FLAG_FADVISE_* hints forwarded to posix_fadvise
+//
+// Returns the number of bytes read, or a negative errno from open/pread.
+// With the debug read-error options enabled it may instead return -EIO.
+int FileStore::read(
+  CollectionHandle& ch,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t len,
+  bufferlist& bl,
+  uint32_t op_flags)
+{
+  int got;
+  tracepoint(objectstore, read_enter, ch->cid.c_str(), offset, len);
+  const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+  auto osr = static_cast<OpSequencer*>(ch.get());
+  osr->wait_for_apply(oid);
+
+  FDRef fd;
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r < 0) {
+    dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") open error: "
+	     << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // A zero offset and zero length request means "the entire object".
+  if (offset == 0 && len == 0) {
+    struct stat st;
+    memset(&st, 0, sizeof(struct stat));
+    int r = ::fstat(**fd, &st);
+    ceph_assert(r == 0);
+    len = st.st_size;
+  }
+
+#ifdef HAVE_POSIX_FADVISE
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM)
+    posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM);
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)
+    posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL);
+#endif
+
+  bufferptr bptr(len);  // prealloc space for entire read
+  got = safe_pread(**fd, bptr.c_str(), len, offset);
+  if (got < 0) {
+    dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl;
+    lfn_close(fd);
+    return got;
+  }
+  bptr.set_length(got);   // properly size the buffer
+  bl.clear();
+  bl.push_back(std::move(bptr));   // put it in the target bufferlist
+
+#ifdef HAVE_POSIX_FADVISE
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
+    posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED);
+  // Reset the access pattern hint set before the read.
+  if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL))
+    posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL);
+#endif
+
+  if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
+    ostringstream ss;
+    int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
+    if (errors != 0) {
+      dout(0) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+	      << got << " ... BAD CRC:\n" << ss.str() << dendl;
+      ceph_abort_msg("bad crc on read");
+    }
+  }
+
+  lfn_close(fd);
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+	   << got << "/" << len << dendl;
+
+  // Debug hook: deterministic data EIO injection.
+  if (cct->_conf->filestore_debug_inject_read_err &&
+      debug_data_eio(oid)) {
+    return -EIO;
+  }
+
+  // Debug hook: random EIO injection.
+  if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
+      cct->_conf->filestore_debug_random_read_err) {
+    // A non-zero setting below 0.01 truncates to 0; taking rand() modulo 0
+    // would be undefined behaviour, so such a setting injects nothing.
+    const int modulus =
+      (int)(cct->_conf->filestore_debug_random_read_err * 100.0);
+    if (modulus != 0 && (rand() % modulus) == 0) {
+      dout(0) << __func__ << ": inject random EIO" << dendl;
+      return -EIO;
+    }
+  }
+
+  tracepoint(objectstore, read_exit, got);
+  return got;
+}
+
+// Collect the mapped extents of `fd` within [offset, offset + len) into `*m`
+// as logical-offset -> length entries, using backend->do_fiemap().
+//
+// The backend is queried repeatedly (via the `more` label) until an extent
+// flagged FIEMAP_EXTENT_LAST has been seen or the remaining length reaches 0.
+// Logically adjacent extents are merged before being recorded.
+//
+// Returns a negative value from backend->do_fiemap() on failure, otherwise
+// the result of the last do_fiemap() call.
+int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len,
+                          map<uint64_t, uint64_t> *m)
+{
+  uint64_t i;
+  struct fiemap_extent *extent = nullptr;
+  struct fiemap *fiemap = nullptr;
+  int r = 0;
+
+more:
+  r = backend->do_fiemap(fd, offset, len, &fiemap);
+  if (r < 0)
+    return r;
+
+  // Nothing mapped in the remaining range: release the buffer and stop.
+  if (fiemap->fm_mapped_extents == 0) {
+    free(fiemap);
+    return r;
+  }
+
+  extent = &fiemap->fm_extents[0];
+
+  /* start where we were asked to start */
+  if (extent->fe_logical < offset) {
+    extent->fe_length -= offset - extent->fe_logical;
+    extent->fe_logical = offset;
+  }
+
+  i = 0;
+
+  // `last` tracks the most recently recorded extent; it is always set by the
+  // loop below because fm_mapped_extents is non-zero here.
+  struct fiemap_extent *last = nullptr;
+  while (i < fiemap->fm_mapped_extents) {
+    struct fiemap_extent *next = extent + 1;
+
+    dout(10) << __FUNC__ << ": fm_mapped_extents=" << fiemap->fm_mapped_extents
+             << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl;
+
+    /* try to merge extents */
+    // The merge is done in place: the following extent absorbs the current
+    // one (start and length), so it keeps its own fe_flags.
+    while ((i < fiemap->fm_mapped_extents - 1) &&
+           (extent->fe_logical + extent->fe_length == next->fe_logical)) {
+        next->fe_length += extent->fe_length;
+        next->fe_logical = extent->fe_logical;
+        extent = next;
+        next = extent + 1;
+        i++;
+    }
+
+    // Clamp an extent that runs past the end of the requested range.
+    // NOTE(review): this assumes fe_logical <= offset + len; otherwise the
+    // unsigned subtraction would wrap -- confirm the backend guarantees it.
+    if (extent->fe_logical + extent->fe_length > offset + len)
+      extent->fe_length = offset + len - extent->fe_logical;
+    (*m)[extent->fe_logical] = extent->fe_length;
+    i++;
+    last = extent++;
+  }
+  // Advance the window past everything recorded so far before deciding
+  // whether another query is needed.
+  uint64_t xoffset = last->fe_logical + last->fe_length - offset;
+  offset = last->fe_logical + last->fe_length;
+  len -= xoffset;
+  // Read the flag before the buffer holding `last` is released.
+  const bool is_last = (last->fe_flags & FIEMAP_EXTENT_LAST) || (len == 0);
+  free(fiemap);
+  if (!is_last) {
+    goto more;
+  }
+
+  return r;
+}
+
+// Collect the data regions of `fd` within [offset, offset + len) into `*m`
+// as start -> length entries, using lseek() with SEEK_DATA / SEEK_HOLE.
+//
+// Where SEEK_DATA / SEEK_HOLE are unavailable the whole range is reported as
+// a single data extent.
+//
+// Returns 0 on success or a negative errno if lseek() fails with anything
+// other than ENXIO.
+int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len,
+                                  map<uint64_t, uint64_t> *m)
+{
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+  off_t hole_pos, data_pos;
+  int r = 0;
+
+  // Exclusive end of the requested range.
+  const off_t end = (off_t)(offset + len);
+
+  // If lseek fails with errno setting to be ENXIO, this means the current
+  // file offset is beyond the end of the file.
+  off_t start = offset;
+  while (start < end) {
+    data_pos = lseek(fd, start, SEEK_DATA);
+    if (data_pos < 0) {
+      if (errno == ENXIO)
+        break;
+      else {
+        r = -errno;
+        dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    } else if (data_pos >= end) {
+      // Data that starts at or after the end of the range is outside it;
+      // stopping here avoids recording a zero-length extent at `end`.
+      break;
+    }
+
+    hole_pos = lseek(fd, data_pos, SEEK_HOLE);
+    if (hole_pos < 0) {
+      if (errno == ENXIO) {
+        break;
+      } else {
+        r = -errno;
+        dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    }
+
+    // The data region reaches the end of the range: clamp it and finish.
+    if (hole_pos >= end) {
+      (*m)[data_pos] = offset + len - data_pos;
+      break;
+    }
+    (*m)[data_pos] = hole_pos - data_pos;
+    start = hole_pos;
+  }
+
+  return r;
+#else
+  (*m)[offset] = len;
+  return 0;
+#endif
+}
+
+// Encoded variant of fiemap(): computes the extent map for the requested
+// range and, when that succeeds (result >= 0), encodes it into `bl`.
+// Returns the result of the map-producing overload.
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+                    uint64_t offset, size_t len,
+                    bufferlist& bl)
+{
+  map<uint64_t, uint64_t> extents;
+  const int rc = fiemap(ch, oid, offset, len, extents);
+  if (rc < 0) {
+    return rc;
+  }
+  encode(extents, bl);
+  return rc;
+}
+
+// Fill `destmap` with the data extents (start -> length) of `oid` within
+// [offset, offset + len).
+//
+// When the backend supports neither SEEK_DATA/SEEK_HOLE nor the fiemap
+// ioctl, or `len` does not exceed m_filestore_fiemap_threshold, the whole
+// range is reported as one extent and 0 is returned.  Otherwise the object
+// is opened and probed, preferring seek-hole/data over fiemap.
+//
+// Returns the probe result, or the negative result of lfn_open().
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+                    uint64_t offset, size_t len,
+                    map<uint64_t, uint64_t>& destmap)
+{
+  tracepoint(objectstore, fiemap_enter, ch->cid.c_str(), offset, len);
+  const coll_t& cid = _need_temp_object_collection(ch->cid, oid) ? ch->cid.get_temp() : ch->cid;
+  destmap.clear();
+
+  // Cheap path: no sparse-file support, or the range is too small to bother.
+  const bool can_probe = backend->has_seek_data_hole() || backend->has_fiemap();
+  if (!can_probe || len <= (size_t)m_filestore_fiemap_threshold) {
+    destmap[offset] = len;
+    return 0;
+  }
+
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+  OpSequencer *sequencer = static_cast<OpSequencer*>(ch.get());
+  sequencer->wait_for_apply(oid);
+
+  FDRef fd;
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r < 0) {
+    dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl;
+  } else {
+    if (backend->has_seek_data_hole()) {
+      dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+      r = _do_seek_hole_data(**fd, offset, len, &destmap);
+    } else if (backend->has_fiemap()) {
+      dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl;
+      r = _do_fiemap(**fd, offset, len, &destmap);
+    }
+    lfn_close(fd);
+  }
+
+  // Common exit for both the open-failure and the probed paths.
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << destmap.size() << " " << destmap << dendl;
+  if (r == -EIO && m_filestore_fail_eio) {
+    handle_eio();
+  }
+  tracepoint(objectstore, fiemap_exit, r);
+  return r;
+}
+
+// Remove object `oid` from collection `cid` by delegating to lfn_unlink(),
+// logging before and after.  Returns lfn_unlink()'s result.
+int FileStore::_remove(const coll_t& cid, const ghobject_t& oid,
+		       const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+  const int rc = lfn_unlink(cid, oid, spos);
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << rc << dendl;
+  return rc;
+}
+
+// Truncate object `oid` in collection `cid` to `size` bytes by delegating to
+// lfn_truncate(), logging before and after.  Returns lfn_truncate()'s result.
+int FileStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << dendl;
+
+  const int rc = lfn_truncate(cid, oid, size);
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << " = " << rc << dendl;
+  return rc;
+}
+
+
+// Open object `oid` in collection `cid` with the create flag set, then close
+// it again.  Returns the lfn_open() result; a failed open returns right away
+// without the trailing log line.
+int FileStore::_touch(const coll_t& cid, const ghobject_t& oid)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+  FDRef obj_fd;
+  const int rc = lfn_open(cid, oid, true, &obj_fd);
+  if (rc < 0) {
+    return rc;
+  }
+
+  lfn_close(obj_fd);
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << rc << dendl;
+  return rc;
+}
+
+// Write the contents of `bl` to object `oid` in collection `cid` at `offset`,
+// opening the object with the create flag set.
+//
+// After a successful write the sloppy-CRC state is updated when enabled, and
+// the write is either queued on the writeback throttle or, while replaying
+// or with the throttle disabled, optionally followed by a DONTNEED fadvise.
+//
+// Returns bl.length() on success or the negative result of the failing
+// open/write.
+int FileStore::_write(const coll_t& cid, const ghobject_t& oid,
+                     uint64_t offset, size_t len,
+                     const bufferlist& bl, uint32_t fadvise_flags)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+  FDRef fd;
+  int r = lfn_open(cid, oid, true, &fd);
+  if (r < 0) {
+    dout(0) << __FUNC__ << ": couldn't open " << cid << "/"
+	    << oid << ": "
+	    << cpp_strerror(r) << dendl;
+  } else {
+    // write
+    r = bl.write_fd(**fd, offset);
+    if (r < 0) {
+      derr << __FUNC__ << ": write_fd on " << cid << "/" << oid
+           << " error: " << cpp_strerror(r) << dendl;
+    } else {
+      r = bl.length();
+
+      if (r >= 0 && m_filestore_sloppy_crc) {
+        const int crc_rc = backend->_crc_update_write(**fd, offset, len, bl);
+        ceph_assert(crc_rc >= 0);
+      }
+
+      const bool want_dontneed =
+        fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+      if (replaying || m_disable_wbthrottle) {
+        if (want_dontneed) {
+#ifdef HAVE_POSIX_FADVISE
+          posix_fadvise(**fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+        }
+      } else {
+        wbthrottle.queue_wb(fd, oid, offset, len, want_dontneed);
+      }
+    }
+
+    // The fd is released on both the failed-write and the success path.
+    lfn_close(fd);
+  }
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl;
+  return r;
+}
+
+// Zero the byte range [offset, offset+len) of object `oid` in `cid`.
+//
+// When filestore_punch_hole is enabled (and the platform offers
+// fallocate with FALLOC_FL_KEEP_SIZE) a hole is punched and, if the range
+// reaches past the current end of file, the file is extended with
+// ftruncate(). If hole punching is disabled, compiled out, or fails with
+// -EOPNOTSUPP, the range is filled by writing a buffer of zeros via _write().
+//
+// Returns 0 from the hole-punch path, the (non-negative) result of _write()
+// from the fallback path, or a negative errno on failure.
+int FileStore::_zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+  int ret = 0;
+
+  if (cct->_conf->filestore_punch_hole) {
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(__APPLE__) && !defined(__FreeBSD__)
+#    ifdef FALLOC_FL_KEEP_SIZE
+    // first try to punch a hole.
+    FDRef fd;
+    ret = lfn_open(cid, oid, false, &fd);
+    if (ret < 0) {
+      goto out;
+    }
+
+    // remember the current size so we know whether the range extends the file
+    struct stat st;
+    ret = ::fstat(**fd, &st);
+    if (ret < 0) {
+      ret = -errno;
+      lfn_close(fd);
+      goto out;
+    }
+
+    // first try fallocate
+    ret = fallocate(**fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+                    offset, len);
+    if (ret < 0) {
+      ret = -errno;
+    } else {
+      // ensure we extend file size, if needed
+      if (len > 0 && offset + len > (uint64_t)st.st_size) {
+        ret = ::ftruncate(**fd, offset + len);
+        if (ret < 0) {
+          ret = -errno;
+          lfn_close(fd);
+          goto out;
+        }
+      }
+    }
+
+    // Update the sloppy CRC while the descriptor is still held; the
+    // descriptor is only handed back to lfn_close() afterwards.
+    if (ret >= 0 && m_filestore_sloppy_crc) {
+      int rc = backend->_crc_update_zero(**fd, offset, len);
+      ceph_assert(rc >= 0);
+    }
+    lfn_close(fd);
+
+    if (ret == 0)
+      goto out;  // yay!
+    if (ret != -EOPNOTSUPP)
+      goto out;  // some other error
+#    endif
+# endif
+#endif
+  }
+
+  // lame, kernel is old and doesn't support it.
+  // write zeros.. yuck!
+  dout(20) << __FUNC__ << ": falling back to writing zeros" << dendl;
+  {
+    bufferlist bl;
+    bl.append_zero(len);
+    ret = _write(cid, oid, offset, len, bl);
+  }
+
+  // the label only exists when the hole-punch path (its sole user) is compiled
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(__APPLE__) && !defined(__FreeBSD__)
+#    ifdef FALLOC_FL_KEEP_SIZE
+ out:
+#    endif
+# endif
+#endif
+  dout(20) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl;
+  return ret;
+}
+
+// Clone object `oldoid` into `newoid` within collection `cid`: file data,
+// object_map contents and xattrs.
+//
+// Returns 0 without doing anything when _check_replay_guard() reports a
+// negative value for the destination. Otherwise returns the result of the
+// last step performed (negative errno on failure); -EIO additionally triggers
+// handle_eio() when m_filestore_fail_eio is set.
+//
+// Cleanup labels: out3 closes both objects, out closes only the source,
+// out2 closes nothing.
+int FileStore::_clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+		      const SequencerPosition& spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
+
+  if (_check_replay_guard(cid, newoid, spos) < 0)
+    return 0;
+
+  int r;
+  FDRef o, n;
+  {
+    // Data + object_map clone happen inside this scope, under the index
+    // write lock taken below; the lock is dropped when the scope is left.
+    Index index;
+    r = lfn_open(cid, oldoid, false, &o, &index);
+    if (r < 0) {
+      goto out2;
+    }
+    ceph_assert(index.index);
+    RWLock::WLocker l((index.index)->access_lock);
+
+    r = lfn_open(cid, newoid, true, &n, &index);
+    if (r < 0) {
+      goto out;
+    }
+    // start from an empty destination before copying the source range
+    r = ::ftruncate(**n, 0);
+    if (r < 0) {
+      r = -errno;
+      goto out3;
+    }
+    struct stat st;
+    r = ::fstat(**o, &st);
+    if (r < 0) {
+      r = -errno;
+      goto out3;
+    }
+
+    r = _do_clone_range(**o, **n, 0, st.st_size, 0);
+    if (r < 0) {
+      goto out3;
+    }
+
+    dout(20) << "objectmap clone" << dendl;
+    r = object_map->clone(oldoid, newoid, &spos);
+    // -ENOENT from the object_map clone is tolerated
+    if (r < 0 && r != -ENOENT)
+      goto out3;
+  }
+
+  {
+    // Copy xattrs: read them all from the source, write the spill-out marker
+    // on the destination, then write the attribute set.
+    char buf[2];
+    map<string, bufferptr> aset;
+    r = _fgetattrs(**o, aset);
+    if (r < 0)
+      goto out3;
+
+    // Propagate the source's "no spill out" marker; anything else (including
+    // a failed read of the marker) results in the "spill out" marker.
+    r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+    if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+      r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+                          sizeof(XATTR_NO_SPILL_OUT));
+    } else {
+      r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+                          sizeof(XATTR_SPILL_OUT));
+    }
+    if (r < 0)
+      goto out3;
+
+    r = _fsetattrs(**n, aset);
+    if (r < 0)
+      goto out3;
+  }
+
+  // clone is non-idempotent; record our work.
+  _set_replay_guard(**n, spos, &newoid);
+
+ out3:
+  lfn_close(n);
+ out:
+  lfn_close(o);
+ out2:
+  dout(10) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl;
+  if (r == -EIO && m_filestore_fail_eio) handle_eio();
+  return r;
+}
+
+// Log the request and hand the range copy (`len` bytes from `srcoff` in
+// `from` to `dstoff` in `to`) to the backend's clone_range().
+// Returns the backend's result unchanged.
+int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+  dout(20) << __FUNC__ << ": copy " << srcoff << "~" << len << " to " << dstoff << dendl;
+
+  const int rc = backend->clone_range(from, to, srcoff, len, dstoff);
+  return rc;
+}
+
+// Copy only the mapped extents of [srcoff, srcoff+len) from descriptor `from`
+// to descriptor `to` (shifted so that srcoff lands on dstoff), then make sure
+// `to` is at least dstoff+len bytes long.
+//
+// The extent map comes from _do_seek_hole_data() when the backend reports
+// has_seek_data_hole(), otherwise from _do_fiemap() when it reports
+// has_fiemap(). If neither is available the map stays empty and no data is
+// copied; only the size adjustment is performed.
+//
+// Each extent is copied with _do_copy_range(..., skip_sloppycrc = true); the
+// sloppy CRC is instead updated once for the whole range afterwards.
+//
+// Returns the sum of the copied extent lengths (narrowed to int) on success,
+// 0 when len == 0, or a negative errno.
+int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+  dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl;
+  int r = 0;
+  map<uint64_t, uint64_t> exomap;
+  // fiemap doesn't allow zero length
+  if (len == 0)
+    return 0;
+
+  if (backend->has_seek_data_hole()) {
+    dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl;
+    r = _do_seek_hole_data(from, srcoff, len, &exomap);
+  } else if (backend->has_fiemap()) {
+    dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl;
+    r = _do_fiemap(from, srcoff, len, &exomap);
+  }
+
+  // declared before the first goto so the jump does not bypass its initialisation
+  int64_t written = 0;
+  if (r < 0)
+    goto out;
+
+  for (map<uint64_t, uint64_t>::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) {
+    // translate the source extent offset into the destination's coordinates
+    uint64_t it_off = miter->first - srcoff + dstoff;
+    r = _do_copy_range(from, to, miter->first, miter->second, it_off, true);
+    if (r < 0) {
+      derr << __FUNC__ << ": copy error at " << miter->first << "~" << miter->second
+             << " to " << it_off << ", " << cpp_strerror(r) << dendl;
+      break;
+    }
+    written += miter->second;
+  }
+
+  if (r >= 0) {
+    if (m_filestore_sloppy_crc) {
+      int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+      ceph_assert(rc >= 0);
+    }
+    struct stat st;
+    r = ::fstat(to, &st);
+    if (r < 0) {
+      r = -errno;
+      derr << __FUNC__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl;
+      goto out;
+    }
+    // Compare in 64 bits: narrowing dstoff+len to int would truncate targets
+    // past 2 GiB and skip (or mis-decide) the extension of a trailing hole.
+    if (static_cast<uint64_t>(st.st_size) < dstoff + len) {
+      r = ::ftruncate(to, dstoff + len);
+      if (r < 0) {
+        r = -errno;
+        derr << __FUNC__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl;
+        goto out;
+      }
+    }
+    r = written;
+  }
+
+ out:
+  dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+  return r;
+}
+
+// Copy `len` bytes from descriptor `from` at `srcoff` to descriptor `to` at
+// `dstoff`, in chunks of at most `buflen` bytes.
+//
+// Two implementations:
+//  - when built with CEPH_HAVE_SPLICE and the backend reports has_splice(),
+//    data is moved through a pipe with safe_splice();
+//  - otherwise both descriptors are positioned with lseek64() and the data is
+//    moved with read() / safe_write() through a stack buffer.
+//
+// A zero-length read from the source is reported as -ERANGE. While
+// `replaying`, a failure must be -ERANGE (asserted) and is converted to
+// success. When not replaying, the function asserts that the whole range was
+// consumed. Unless `skip_sloppycrc` is set, the sloppy CRC is updated after a
+// successful copy when m_filestore_sloppy_crc is enabled.
+//
+// Returns a non-negative value on success, negative errno on failure.
+int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc)
+{
+  dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl;
+  int r = 0;
+  loff_t pos = srcoff;
+  loff_t end = srcoff + len;
+  // const so that the fallback buffer below is an ordinary fixed-size array
+  const int buflen = 4096 * 16; //limit by pipe max size.see fcntl
+
+#ifdef CEPH_HAVE_SPLICE
+  if (backend->has_splice()) {
+    int pipefd[2];
+    if (pipe_cloexec(pipefd) < 0) {
+      int e = errno;
+      derr << " pipe " << " got " << cpp_strerror(e) << dendl;
+      return -e;
+    }
+
+    loff_t dstpos = dstoff;
+    while (pos < end) {
+      // Take the minimum in 64 bits first; narrowing the remaining length to
+      // int before comparing would wrap for ranges of 2 GiB and more.
+      int l = static_cast<int>(std::min<loff_t>(end - pos, buflen));
+      r = safe_splice(from, &pos, pipefd[1], nullptr, l, SPLICE_F_NONBLOCK);
+      dout(10) << "  safe_splice read from " << pos << "~" << l << " got " << r << dendl;
+      if (r < 0) {
+        derr << __FUNC__ << ": safe_splice read error at " << pos << "~" << len
+          << ", " << cpp_strerror(r) << dendl;
+        break;
+      }
+      if (r == 0) {
+        // hrm, bad source range, wtf.
+        r = -ERANGE;
+        derr << __FUNC__ << ": got short read result at " << pos
+          << " of fd " << from << " len " << len << dendl;
+        break;
+      }
+
+      // drain what was just spliced into the pipe out to the destination
+      r = safe_splice(pipefd[0], nullptr, to, &dstpos, r, 0);
+      dout(10) << " safe_splice write to " << to << " len " << r
+        << " got " << r << dendl;
+      if (r < 0) {
+        derr << __FUNC__ << ": write error at " << pos << "~"
+          << r << ", " << cpp_strerror(r) << dendl;
+        break;
+      }
+    }
+    close(pipefd[0]);
+    close(pipefd[1]);
+  } else
+#endif
+  {
+    int64_t actual;
+
+    actual = ::lseek64(from, srcoff, SEEK_SET);
+    if (actual != (int64_t)srcoff) {
+      if (actual < 0)
+        r = -errno;
+      else
+        r = -EINVAL;
+      derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    actual = ::lseek64(to, dstoff, SEEK_SET);
+    if (actual != (int64_t)dstoff) {
+      if (actual < 0)
+        r = -errno;
+      else
+        r = -EINVAL;
+      derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    char buf[buflen];
+    while (pos < end) {
+      // 64-bit minimum, see the splice path above
+      int l = static_cast<int>(std::min<loff_t>(end - pos, buflen));
+      r = ::read(from, buf, l);
+      dout(25) << "  read from " << pos << "~" << l << " got " << r << dendl;
+      if (r < 0) {
+        if (errno == EINTR) {
+          continue;
+        } else {
+          r = -errno;
+          derr << __FUNC__ << ": read error at " << pos << "~" << len
+            << ", " << cpp_strerror(r) << dendl;
+          break;
+        }
+      }
+      if (r == 0) {
+        // hrm, bad source range, wtf.
+        r = -ERANGE;
+        derr << __FUNC__ << ": got short read result at " << pos
+          << " of fd " << from << " len " << len << dendl;
+        break;
+      }
+      int op = 0;
+      while (op < r) {
+        int r2 = safe_write(to, buf+op, r-op);
+        dout(25) << " write to " << to << " len " << (r-op)
+          << " got " << r2 << dendl;
+        if (r2 < 0) {
+          r = r2;
+          derr << __FUNC__ << ": write error at " << pos << "~"
+            << r-op << ", " << cpp_strerror(r) << dendl;
+
+          break;
+        }
+        // a non-negative safe_write result is treated as "everything written"
+        op += (r-op);
+      }
+      if (r < 0)
+        break;
+      pos += r;
+    }
+  }
+
+  if (r < 0 && replaying) {
+    ceph_assert(r == -ERANGE);
+    derr << __FUNC__ << ": short source tolerated because we are replaying" << dendl;
+    r = len;
+  }
+  ceph_assert(replaying || pos == end);
+  if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) {
+    int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+    ceph_assert(rc >= 0);
+  }
+  dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+  return r;
+}
+
+// Copy `len` bytes starting at `srcoff` of oldcid/oldoid into newcid/newoid
+// at `dstoff`, then record a replay guard on the destination.
+//
+// Returns 0 without doing anything when _check_replay_guard() reports a
+// negative value for the destination; otherwise the result of the last step
+// performed (negative on failure).
+int FileStore::_clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
+                            uint64_t srcoff, uint64_t len, uint64_t dstoff,
+                            const SequencerPosition& spos)
+{
+  dout(15) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl;
+
+  if (_check_replay_guard(newcid, newoid, spos) < 0)
+    return 0;
+
+  FDRef src, dst;
+  int r = lfn_open(oldcid, oldoid, false, &src);
+  if (r >= 0) {
+    r = lfn_open(newcid, newoid, true, &dst);
+    if (r >= 0) {
+      r = _do_clone_range(**src, **dst, srcoff, len, dstoff);
+      if (r >= 0) {
+        // clone is non-idempotent; record our work.
+        _set_replay_guard(**dst, spos, &newoid);
+      }
+      lfn_close(dst);
+    }
+    lfn_close(src);
+  }
+
+  dout(10) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " "
+           << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+  return r;
+}
+
+// Context whose finish() logs a backtrace together with the configured
+// commit timeout and then aborts the process.
+class SyncEntryTimeout : public Context {
+public:
+  CephContext* cct;
+
+  explicit SyncEntryTimeout(CephContext* cct, int commit_timeo)
+    : cct(cct), m_commit_timeo(commit_timeo)
+  {
+  }
+
+  // Never returns: emits the diagnostic and calls ceph_abort().
+  void finish(int r) override {
+    BackTrace trace(1);
+    generic_dout(-1) << "FileStore: sync_entry timed out after "
+           << m_commit_timeo << " seconds.\n";
+    trace.print(*_dout);
+    *_dout << dendl;
+    ceph_abort();
+  }
+
+private:
+  int m_commit_timeo;  // timeout value reported in the log line, in seconds
+};
+
+// Commit loop. Runs until `stop` is set, holding `lock` except while a commit
+// is in progress.
+//
+// Each round:
+//  1. waits on sync_cond for up to the max sync interval (skipped when
+//     force_sync is already set), and tops the wait up to the min interval
+//     when woken early without force_sync/stop;
+//  2. takes over the pending sync_waiters, pauses the op thread pool and asks
+//     apply_manager.commit_start() whether there is anything to commit;
+//  3. if so, arms a SyncEntryTimeout on `timer`, then either
+//       - (backend can checkpoint) writes op_seq, creates a checkpoint named
+//         after the committing seq and optionally waits for it, or
+//       - syncs the object map, syncfs()es the backend, writes and fsyncs
+//         op_seq;
+//     updates perf counters, finishes the commit, trims old checkpoints down
+//     to two and cancels the timeout;
+//  4. completes the waiters collected in step 2 and loops straight back to
+//     step 2 when more waiters arrived or the journal asks for another commit.
+//
+// Unrecoverable errors abort the process. `stop` is reset to false on exit.
+void FileStore::sync_entry()
+{
+  lock.Lock();
+  while (!stop) {
+    utime_t max_interval;
+    max_interval.set_from_double(m_filestore_max_sync_interval);
+    utime_t min_interval;
+    min_interval.set_from_double(m_filestore_min_sync_interval);
+
+    utime_t startwait = ceph_clock_now();
+    if (!force_sync) {
+      dout(20) << __FUNC__ << ":  waiting for max_interval " << max_interval << dendl;
+      sync_cond.WaitInterval(lock, max_interval);
+    } else {
+      dout(20) << __FUNC__ << ": not waiting, force_sync set" << dendl;
+    }
+
+    if (force_sync) {
+      dout(20) << __FUNC__ << ": force_sync set" << dendl;
+      force_sync = false;
+    } else if (stop) {
+      dout(20) << __FUNC__ << ": stop set" << dendl;
+      break;
+    } else {
+      // wait for at least the min interval
+      utime_t woke = ceph_clock_now();
+      woke -= startwait;
+      dout(20) << __FUNC__ << ": woke after " << woke << dendl;
+      if (woke < min_interval) {
+        utime_t t = min_interval;
+        t -= woke;
+        dout(20) << __FUNC__ << ": waiting for another " << t
+                 << " to reach min interval " << min_interval << dendl;
+        sync_cond.WaitInterval(lock, t);
+      }
+    }
+
+    list<Context*> fin;
+  again:
+    // take ownership of everyone waiting so far; they are completed after
+    // this commit round
+    fin.swap(sync_waiters);
+    lock.Unlock();
+
+    op_tp.pause();
+    if (apply_manager.commit_start()) {
+      utime_t start = ceph_clock_now();
+      uint64_t cp = apply_manager.get_committing_seq();
+
+      // arm the commit watchdog; a null pointer afterwards means it was not
+      // scheduled and must not be cancelled
+      sync_entry_timeo_lock.Lock();
+      SyncEntryTimeout *sync_entry_timeo =
+        new SyncEntryTimeout(cct, m_filestore_commit_timeout);
+      if (!timer.add_event_after(m_filestore_commit_timeout,
+                                 sync_entry_timeo)) {
+        sync_entry_timeo = nullptr;
+      }
+      sync_entry_timeo_lock.Unlock();
+
+      logger->set(l_filestore_committing, 1);
+
+      dout(15) << __FUNC__ << ": committing " << cp << dendl;
+      stringstream errstream;
+      if (cct->_conf->filestore_debug_omap_check && !object_map->check(errstream)) {
+        derr << errstream.str() << dendl;
+        ceph_abort();
+      }
+
+      if (backend->can_checkpoint()) {
+        int err = write_op_seq(op_fd, cp);
+        if (err < 0) {
+          derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+          ceph_abort_msg("error during write_op_seq");
+        }
+
+        char s[NAME_MAX];
+        snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+        uint64_t cid = 0;
+        err = backend->create_checkpoint(s, &cid);
+        if (err < 0) {
+            // NOTE(review): this shadows the returned code and consults errno
+            // instead; if errno happens to be 0 here the failure is not
+            // caught by the assert. Left unchanged -- confirm against the
+            // backend's contract before altering.
+            int err = errno;
+            derr << "snap create '" << s << "' got error " << err << dendl;
+            ceph_assert(err == 0);
+        }
+
+        snaps.push_back(cp);
+        apply_manager.commit_started();
+        op_tp.unpause();
+
+        if (cid > 0) {
+          dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl;
+          err = backend->sync_checkpoint(cid);
+          if (err < 0) {
+            derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl;
+            ceph_abort_msg("wait_sync got error");
+          }
+          dout(20) << " done waiting for checkpoint " << cid << " to complete" << dendl;
+        }
+      } else {
+        apply_manager.commit_started();
+        op_tp.unpause();
+
+        int err = object_map->sync();
+        if (err < 0) {
+          derr << "object_map sync got " << cpp_strerror(err) << dendl;
+          ceph_abort_msg("object_map sync returned error");
+        }
+
+        err = backend->syncfs();
+        if (err < 0) {
+          derr << "syncfs got " << cpp_strerror(err) << dendl;
+          ceph_abort_msg("syncfs returned error");
+        }
+
+        err = write_op_seq(op_fd, cp);
+        if (err < 0) {
+          derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+          ceph_abort_msg("error during write_op_seq");
+        }
+        err = ::fsync(op_fd);
+        if (err < 0) {
+          // fsync() signals failure with -1 and leaves the cause in errno;
+          // capture it before any logging so the message names the real error.
+          err = -errno;
+          derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl;
+          ceph_abort_msg("error during fsync of op_seq");
+        }
+      }
+
+      utime_t done = ceph_clock_now();
+      utime_t lat = done - start;        // time spent committing
+      utime_t dur = done - startwait;    // time since this round began waiting
+      dout(10) << __FUNC__ << ": commit took " << lat << ", interval was " << dur << dendl;
+      utime_t max_pause_lat = logger->tget(l_filestore_sync_pause_max_lat);
+      if (max_pause_lat < dur - lat) {
+        logger->tinc(l_filestore_sync_pause_max_lat, dur - lat);
+      }
+
+      logger->inc(l_filestore_commitcycle);
+      logger->tinc(l_filestore_commitcycle_latency, lat);
+      logger->tinc(l_filestore_commitcycle_interval, dur);
+
+      apply_manager.commit_finish();
+      if (!m_disable_wbthrottle) {
+        wbthrottle.clear();
+      }
+
+      logger->set(l_filestore_committing, 0);
+
+      // remove old snaps?
+      if (backend->can_checkpoint()) {
+        char s[NAME_MAX];
+        while (snaps.size() > 2) {
+          snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front());
+          snaps.pop_front();
+          dout(10) << "removing snap '" << s << "'" << dendl;
+          int r = backend->destroy_checkpoint(s);
+          if (r) {
+            // NOTE(review): reports errno rather than the returned code `r`;
+            // left unchanged -- confirm which one the backend guarantees.
+            int err = errno;
+            derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl;
+          }
+        }
+      }
+
+      dout(15) << __FUNC__ << ": committed to op_seq " << cp << dendl;
+
+      if (sync_entry_timeo) {
+        Mutex::Locker timeo_locker(sync_entry_timeo_lock);
+        timer.cancel_event(sync_entry_timeo);
+      }
+    } else {
+      op_tp.unpause();
+    }
+
+    lock.Lock();
+    finish_contexts(cct, fin, 0);
+    fin.clear();
+    if (!sync_waiters.empty()) {
+      dout(10) << __FUNC__ << ": more waiters, committing again" << dendl;
+      goto again;
+    }
+    if (!stop && journal && journal->should_commit_now()) {
+      dout(10) << __FUNC__ << ": journal says we should commit again (probably is/was full)" << dendl;
+      goto again;
+    }
+  }
+  stop = false;
+  lock.Unlock();
+}
+
+// Request an immediate sync: set force_sync and signal sync_cond, both while
+// holding `lock`.
+void FileStore::do_force_sync()
+{
+  dout(10) << __FUNC__ << dendl;
+
+  Mutex::Locker guard(lock);
+  force_sync = true;
+  sync_cond.Signal();
+}
+
+// Queue `onsafe` on sync_waiters and request an immediate sync. Everything
+// happens while holding `lock`.
+void FileStore::start_sync(Context *onsafe)
+{
+  Mutex::Locker guard(lock);
+
+  sync_waiters.push_back(onsafe);
+  // flag and signal are both issued under the same lock, so their relative
+  // order is not observable by a waiter
+  force_sync = true;
+  sync_cond.Signal();
+
+  dout(10) << __FUNC__ << dendl;
+}
+
+// Blocking sync: registers a completion with start_sync() and waits on a
+// private mutex/condition pair until that completion flips `done`.
+void FileStore::sync()
+{
+  Mutex l("FileStore::sync");
+  Cond c;
+  // Initialise explicitly so the wait loop below never reads an indeterminate
+  // value, independent of what C_SafeCond's constructor does with it.
+  bool done = false;
+  // presumably completed (and released) by whoever drains sync_waiters --
+  // ownership is handed over to start_sync()
+  C_SafeCond *fin = new C_SafeCond(&l, &c, &done);
+
+  start_sync(fin);
+
+  l.Lock();
+  while (!done) {
+    dout(10) << "sync waiting" << dendl;
+    c.Wait(l);
+  }
+  l.Unlock();
+  dout(10) << "sync done" << dendl;
+}
+
+// Drain the op work queue, then wait for every apply finisher to run empty.
+void FileStore::_flush_op_queue()
+{
+  dout(10) << __FUNC__ << ": draining op tp" << dendl;
+  op_wq.drain();
+
+  dout(10) << __FUNC__ << ": waiting for apply finisher" << dendl;
+  for (Finisher *finisher : apply_finishers)
+    finisher->wait_for_empty();
+}
+
+/*
+ * flush - make every queued write readable
+ */
+// With filestore_blackhole set this never returns (it parks forever on a
+// private condition variable). Otherwise, in writeahead mode, it flushes the
+// journal (if any) and waits for the ondisk finishers, and in every mode it
+// finally drains the op queue via _flush_op_queue().
+void FileStore::flush()
+{
+  dout(10) << __FUNC__ << dendl;
+
+  if (cct->_conf->filestore_blackhole) {
+    // wait forever
+    Mutex forever_lock("FileStore::flush::lock");
+    Cond forever_cond;
+    forever_lock.Lock();
+    for (;;)
+      forever_cond.Wait(forever_lock);
+    ceph_abort();
+  }
+
+  if (m_filestore_journal_writeahead) {
+    if (journal)
+      journal->flush();
+    dout(10) << __FUNC__ << ": draining ondisk finisher" << dendl;
+    for (Finisher *finisher : ondisk_finishers)
+      finisher->wait_for_empty();
+  }
+
+  _flush_op_queue();
+  dout(10) << __FUNC__ << ": complete" << dendl;
+}
+
+/*
+ * sync_and_flush - make every queued write readable AND committed to disk
+ */
+// Writeahead mode: flush the journal (if any), then drain the op queue.
+// Any other mode: drain the op queue, then run a blocking sync().
+void FileStore::sync_and_flush()
+{
+  dout(10) << __FUNC__ << dendl;
+
+  const bool writeahead = m_filestore_journal_writeahead;
+
+  if (writeahead && journal)
+    journal->flush();
+
+  _flush_op_queue();
+
+  // includes m_filestore_journal_parallel
+  if (!writeahead)
+    sync();
+
+  dout(10) << __FUNC__ << ": done" << dendl;
+}
+
+// Run sync_and_flush() followed by a blocking sync(). Always returns 0.
+int FileStore::flush_journal()
+{
+  dout(10) << __FUNC__ << dendl;
+
+  sync_and_flush();
+  sync();
+
+  return 0;
+}
+
+// Create a backend checkpoint whose name is CLUSTER_SNAP_ITEM formatted with
+// `name`, after a sync_and_flush().
+// Returns -EOPNOTSUPP when the backend cannot checkpoint, otherwise the
+// result of create_checkpoint().
+int FileStore::snapshot(const string& name)
+{
+  dout(10) << __FUNC__ << ": " << name << dendl;
+  sync_and_flush();
+
+  const bool supported = backend->can_checkpoint();
+  if (!supported) {
+    dout(0) << __FUNC__ << ": " << name << " failed, not supported" << dendl;
+    return -EOPNOTSUPP;
+  }
+
+  char snap_name[NAME_MAX];
+  snprintf(snap_name, sizeof(snap_name), CLUSTER_SNAP_ITEM, name.c_str());
+
+  const int r = backend->create_checkpoint(snap_name, nullptr);
+  if (r != 0) {
+    derr << __FUNC__ << ": " << name << " failed: " << cpp_strerror(r) << dendl;
+  }
+  return r;
+}
+
+// -------------------------------
+// attributes
+
+// Read xattr `name` of descriptor `fd` into `bp`.
+//
+// A stack buffer of CHAIN_XATTR_MAX_BLOCK_LEN bytes is tried first; on
+// -ERANGE the required size is queried and the value is read straight into a
+// freshly created buffer of that size.
+// Returns the value length (>= 0) or a negative errno; asserts that the result
+// is not -EIO when m_filestore_fail_eio is set.
+int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp)
+{
+  char stackbuf[CHAIN_XATTR_MAX_BLOCK_LEN];
+  int got = chain_fgetxattr(fd, name, stackbuf, sizeof(stackbuf));
+
+  if (got == -ERANGE) {
+    // too large for the stack buffer: ask for the size, then fetch in place
+    got = chain_fgetxattr(fd, name, nullptr, 0);
+    if (got > 0) {
+      bp = buffer::create(got);
+      got = chain_fgetxattr(fd, name, bp.c_str(), got);
+    }
+  } else if (got >= 0) {
+    bp = buffer::create(got);
+    memcpy(bp.c_str(), stackbuf, got);
+  }
+
+  ceph_assert(!m_filestore_fail_eio || got != -EIO);
+  return got;
+}
+
+// Load every xattr of descriptor `fd` whose name parse_attrname() accepts
+// (and whose parsed name is non-empty) into `aset`, keyed by the parsed name.
+//
+// The name list is first read into a small stack buffer; on -ERANGE the
+// required size is queried and a heap buffer of that size (+1 for the
+// terminator) is used instead. The heap buffer is owned by a vector so that
+// every return path releases it.
+//
+// Returns 0 on success or the first negative errno encountered; asserts that
+// listing errors are not -EIO when m_filestore_fail_eio is set.
+int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset)
+{
+  // get attr list
+  char names1[100];
+  vector<char> names2;  // heap fallback, stays empty unless names1 is too small
+  char *name = nullptr;
+  int len = chain_flistxattr(fd, names1, sizeof(names1)-1);
+  if (len == -ERANGE) {
+    len = chain_flistxattr(fd, nullptr, 0);
+    if (len < 0) {
+      ceph_assert(!m_filestore_fail_eio || len != -EIO);
+      return len;
+    }
+    dout(10) << " -ERANGE, len is " << len << dendl;
+    names2.resize(len + 1);
+    len = chain_flistxattr(fd, names2.data(), len);
+    dout(10) << " -ERANGE, got " << len << dendl;
+    if (len < 0) {
+      ceph_assert(!m_filestore_fail_eio || len != -EIO);
+      return len;
+    }
+    name = names2.data();
+  } else if (len < 0) {
+    ceph_assert(!m_filestore_fail_eio || len != -EIO);
+    return len;
+  } else {
+    name = names1;
+  }
+  // both buffers reserve one byte beyond `len` for this terminator
+  name[len] = 0;
+
+  // the list is a sequence of NUL-terminated names packed back to back
+  char *end = name + len;
+  while (name < end) {
+    char *attrname = name;  // full on-disk name, used for the actual read
+    if (parse_attrname(&name)) {
+      if (*name) {
+        dout(20) << __FUNC__ << ": " << fd << " getting '" << name << "'" << dendl;
+        int r = _fgetattr(fd, attrname, aset[name]);
+        if (r < 0) {
+          return r;
+        }
+      }
+    }
+    name += strlen(name) + 1;
+  }
+
+  return 0;
+}
+
+// Write every entry of `aset` as an xattr on descriptor `fd`, using the name
+// produced by get_attrname(). Stops at, and returns, the first negative
+// result from chain_fsetxattr(); returns 0 when all writes succeed.
+int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset)
+{
+  for (auto& attr : aset) {
+    char xattr_name[CHAIN_XATTR_MAX_NAME_LEN];
+    get_attrname(attr.first.c_str(), xattr_name, CHAIN_XATTR_MAX_NAME_LEN);
+
+    bufferptr& value = attr.second;
+    // an empty value is passed as an empty string literal
+    const char *val = value.length() ? value.c_str() : "";
+
+    // ??? Why do we skip setting all the other attrs if one fails?
+    const int r = chain_fsetxattr(fd, xattr_name, val, value.length());
+    if (r < 0) {
+      derr << __FUNC__ << ": chain_setxattr returned " << r << dendl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+// debug EIO injection
+// Add `oid` to data_error_set (under read_error_lock).
+void FileStore::inject_data_error(const ghobject_t &oid)
+{
+  Mutex::Locker guard(read_error_lock);
+  dout(10) << __FUNC__ << ": init error on " << oid << dendl;
+  data_error_set.insert(oid);
+}
+// Add `oid` to mdata_error_set (under read_error_lock).
+void FileStore::inject_mdata_error(const ghobject_t &oid)
+{
+  Mutex::Locker guard(read_error_lock);
+  dout(10) << __FUNC__ << ": init error on " << oid << dendl;
+  mdata_error_set.insert(oid);
+}
+
+void FileStore::debug_obj_on_delete(const ghobject_t &oid) {
+  Mutex::Locker l(read_error_lock);
+  dout(10) << __FUNC__ << ": clear error on " << oid << dendl;
+  data_error_set.erase(oid);
+  mdata_error_set.erase(oid);
+}
+// True when `oid` is present in data_error_set; logs when it is.
+bool FileStore::debug_data_eio(const ghobject_t &oid)
+{
+  Mutex::Locker guard(read_error_lock);
+  if (!data_error_set.count(oid))
+    return false;
+
+  dout(10) << __FUNC__ << ": inject error on " << oid << dendl;
+  return true;
+}
+// True when `oid` is present in mdata_error_set; logs when it is.
+bool FileStore::debug_mdata_eio(const ghobject_t &oid)
+{
+  Mutex::Locker guard(read_error_lock);
+  if (!mdata_error_set.count(oid))
+    return false;
+
+  dout(10) << __FUNC__ << ": inject error on " << oid << dendl;
+  return true;
+}
+
+
+// objects
+
+// Read attribute `name` of object `oid` into `bp`.
+//
+// The file xattr is tried first (via _fgetattr); only when that yields
+// -ENODATA is the object_map consulted through get_xattrs().
+//
+// Returns 0 on success (any non-negative length is collapsed to 0) or a
+// negative errno. When filestore_debug_inject_read_err is enabled and
+// debug_mdata_eio(oid) is true, -EIO is returned regardless of the outcome.
+// -EIO additionally triggers handle_eio() when m_filestore_fail_eio is set.
+int FileStore::getattr(CollectionHandle& ch, const ghobject_t& oid, const char *name, bufferptr &bp)
+{
+  tracepoint(objectstore, getattr_enter, ch->cid.c_str());
+  // use the temp collection when _need_temp_object_collection() says so
+  const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl;
+
+  // block until wait_for_apply() returns for this object before reading
+  auto osr = static_cast<OpSequencer*>(ch.get());
+  osr->wait_for_apply(oid);
+
+  FDRef fd;
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r < 0) {
+    goto out;
+  }
+  char n[CHAIN_XATTR_MAX_NAME_LEN];
+  get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
+  r = _fgetattr(**fd, n, bp);
+  lfn_close(fd);
+  if (r == -ENODATA) {
+    // not stored as a file xattr: look it up in the object_map instead
+    map<string, bufferlist> got;
+    set<string> to_get;
+    to_get.insert(string(name));
+    Index index;
+    r = get_index(cid, &index);
+    if (r < 0) {
+      dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+      goto out;
+    }
+    r = object_map->get_xattrs(oid, to_get, &got);
+    if (r < 0 && r != -ENOENT) {
+      dout(10) << __FUNC__ << ": get_xattrs err r =" << r << dendl;
+      goto out;
+    }
+    if (got.empty()) {
+      dout(10) << __FUNC__ << ": got.size() is 0" << dendl;
+      // NOTE(review): this early return bypasses the `out:` path, i.e. the
+      // final log line, the injected-error check and the getattr_exit
+      // tracepoint.
+      return -ENODATA;
+    }
+    bp = bufferptr(got.begin()->second.c_str(),
+		   got.begin()->second.length());
+    r = bp.length();
+  }
+ out:
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+  if (r == -EIO && m_filestore_fail_eio) handle_eio();
+  if (cct->_conf->filestore_debug_inject_read_err &&
+      debug_mdata_eio(oid)) {
+    return -EIO;
+  } else {
+    tracepoint(objectstore, getattr_exit, r);
+    return r < 0 ? r : 0;
+  }
+}
+
+// Collect all attributes of object `oid` into `aset`.
+//
+// File xattrs are loaded with _fgetattrs(). Unless the object's spill-out
+// marker equals XATTR_NO_SPILL_OUT, the attributes kept in the object_map are
+// loaded as well and inserted into `aset` (map::insert, so an existing key
+// from the file xattrs is not overwritten).
+//
+// Returns the result of the last step performed (negative errno on failure).
+// When filestore_debug_inject_read_err is enabled and debug_mdata_eio(oid) is
+// true, -EIO is returned regardless. -EIO additionally triggers handle_eio()
+// when m_filestore_fail_eio is set.
+int FileStore::getattrs(CollectionHandle& ch, const ghobject_t& oid, map<string,bufferptr>& aset)
+{
+  tracepoint(objectstore, getattrs_enter, ch->cid.c_str());
+  // use the temp collection when _need_temp_object_collection() says so
+  const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+  set<string> omap_attrs;
+  map<string, bufferlist> omap_aset;
+  Index index;
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+  // block until wait_for_apply() returns for this object before reading
+  auto osr = static_cast<OpSequencer*>(ch.get());
+  osr->wait_for_apply(oid);
+
+  FDRef fd;
+  bool spill_out = true;
+  char buf[2];
+
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r < 0) {
+    goto out;
+  }
+
+  // spill_out stays true unless the marker is readable and says "no spill out"
+  r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+  if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+    spill_out = false;
+
+  r = _fgetattrs(**fd, aset);
+  lfn_close(fd);
+  fd = FDRef(); // defensive
+  if (r < 0) {
+    goto out;
+  }
+
+  if (!spill_out) {
+    // nothing was spilled to the object_map, so the file xattrs are complete
+    dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl;
+    goto out;
+  }
+
+  r = get_index(cid, &index);
+  if (r < 0) {
+    dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+    goto out;
+  }
+  {
+    // first the list of attribute names, then their values; -ENOENT from
+    // either call is tolerated
+    r = object_map->get_all_xattrs(oid, &omap_attrs);
+    if (r < 0 && r != -ENOENT) {
+      dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+      goto out;
+    }
+
+    r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
+    if (r < 0 && r != -ENOENT) {
+      dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+      goto out;
+    }
+    if (r == -ENOENT)
+      r = 0;
+  }
+  ceph_assert(omap_attrs.size() == omap_aset.size());
+  for (map<string, bufferlist>::iterator i = omap_aset.begin();
+	 i != omap_aset.end();
+	 ++i) {
+    string key(i->first);
+    aset.insert(make_pair(key,
+			    bufferptr(i->second.c_str(), i->second.length())));
+  }
+ out:
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+  if (r == -EIO && m_filestore_fail_eio) handle_eio();
+
+  if (cct->_conf->filestore_debug_inject_read_err &&
+      debug_mdata_eio(oid)) {
+    return -EIO;
+  } else {
+    tracepoint(objectstore, getattrs_exit, r);
+    return r;
+  }
+}
+
+// Store the attributes in `aset` on object `oid`, placing each one either as
+// a file xattr ("inline") or in the object_map ("omap").
+//
+// Placement per attribute:
+//  - if the existing inline set could not be read completely (_fgetattrs
+//    returned -E2BIG), everything goes to omap and any inline copy is removed;
+//  - values longer than m_filestore_max_inline_xattr_size go to omap (an
+//    existing inline copy is removed first);
+//  - a new name goes to omap once the inline set already holds
+//    m_filestore_max_inline_xattrs entries;
+//  - otherwise the attribute is written inline and queued for removal from
+//    omap.
+//
+// Returns the result of the last step performed (negative errno on failure).
+int FileStore::_setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset,
+			 const SequencerPosition &spos)
+{
+  map<string, bufferlist> omap_set;      // attributes destined for the object_map
+  set<string> omap_remove;               // names to drop from the object_map
+  map<string, bufferptr> inline_set;     // inline xattrs currently on the file (+ additions)
+  map<string, bufferptr> inline_to_set;  // inline xattrs that must be (re)written
+  FDRef fd;
+  int spill_out = -1;
+  bool incomplete_inline = false;
+
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r < 0) {
+    goto out;
+  }
+
+  // spill_out becomes 0 only when the marker is readable and says
+  // "no spill out"; in every other case it becomes 1
+  char buf[2];
+  r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+  if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+    spill_out = 0;
+  else
+    spill_out = 1;
+
+  // NOTE(review): apart from -E2BIG and -EIO, an error from _fgetattrs is not
+  // acted upon here; processing continues with whatever inline_set contains.
+  r = _fgetattrs(**fd, inline_set);
+  incomplete_inline = (r == -E2BIG);
+  if (r == -EIO && m_filestore_fail_eio) handle_eio();
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid
+	   << (incomplete_inline ? " (incomplete_inline, forcing omap)" : "")
+	   << dendl;
+
+  for (map<string,bufferptr>::iterator p = aset.begin();
+       p != aset.end();
+       ++p) {
+    char n[CHAIN_XATTR_MAX_NAME_LEN];
+    get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+
+    if (incomplete_inline) {
+      chain_fremovexattr(**fd, n); // ignore any error
+      omap_set[p->first].push_back(p->second);
+      continue;
+    }
+
+    // value too large to keep inline
+    if (p->second.length() > m_filestore_max_inline_xattr_size) {
+	if (inline_set.count(p->first)) {
+	  inline_set.erase(p->first);
+	  r = chain_fremovexattr(**fd, n);
+	  if (r < 0)
+	    goto out_close;
+	}
+	omap_set[p->first].push_back(p->second);
+	continue;
+    }
+
+    // new name, but the inline set is already at its entry limit
+    if (!inline_set.count(p->first) &&
+	  inline_set.size() >= m_filestore_max_inline_xattrs) {
+	omap_set[p->first].push_back(p->second);
+	continue;
+    }
+    // stored inline: make sure no copy lingers in the object_map
+    omap_remove.insert(p->first);
+    inline_set.insert(*p);
+
+    inline_to_set.insert(*p);
+  }
+
+  // flag the object as having spilled attributes; the result of this write is
+  // not checked
+  if (spill_out != 1 && !omap_set.empty()) {
+    chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+		    sizeof(XATTR_SPILL_OUT));
+  }
+
+  r = _fsetattrs(**fd, inline_to_set);
+  if (r < 0)
+    goto out_close;
+
+  if (spill_out && !omap_remove.empty()) {
+    r = object_map->remove_xattrs(oid, omap_remove, &spos);
+    if (r < 0 && r != -ENOENT) {
+      dout(10) << __FUNC__ << ": could not remove_xattrs r = " << r << dendl;
+      if (r == -EIO && m_filestore_fail_eio) handle_eio();
+      goto out_close;
+    } else {
+      r = 0; // don't confuse the debug output
+    }
+  }
+
+  if (!omap_set.empty()) {
+    r = object_map->set_xattrs(oid, omap_set, &spos);
+    if (r < 0) {
+      dout(10) << __FUNC__ << ": could not set_xattrs r = " << r << dendl;
+      if (r == -EIO && m_filestore_fail_eio) handle_eio();
+      goto out_close;
+    }
+  }
+ out_close:
+  lfn_close(fd);
+ out:
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+  return r;
+}
+
+
+// Remove attribute `name` from object `oid`.
+//
+// The file xattr is removed first. Only when that reports -ENODATA and the
+// object's spill-out marker is not XATTR_NO_SPILL_OUT is the attribute
+// removed from the object_map instead.
+//
+// Returns the result of the last step performed (negative errno on failure).
+int FileStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
+                       const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl;
+
+  FDRef fd;
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r >= 0) {
+    // may the attribute live in the object_map? (default: yes)
+    char marker[2];
+    r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, marker, sizeof(marker));
+    const bool spill_out =
+      !(r >= 0 && !strncmp(marker, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)));
+
+    char xattr_name[CHAIN_XATTR_MAX_NAME_LEN];
+    get_attrname(name, xattr_name, CHAIN_XATTR_MAX_NAME_LEN);
+    r = chain_fremovexattr(**fd, xattr_name);
+
+    if (r == -ENODATA && spill_out) {
+      Index index;
+      r = get_index(cid, &index);
+      if (r < 0) {
+        dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+      } else {
+        set<string> to_remove;
+        to_remove.insert(string(name));
+        r = object_map->remove_xattrs(oid, to_remove, &spos);
+        if (r < 0 && r != -ENOENT) {
+          dout(10) << __FUNC__ << ": could not remove_xattrs index r = " << r << dendl;
+          if (r == -EIO && m_filestore_fail_eio) handle_eio();
+        }
+      }
+    }
+    lfn_close(fd);
+  }
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+  return r;
+}
+
+// Remove every attribute of object cid/oid.
+//
+// Step 1 removes each attribute reported by _fgetattrs() from the file's
+// xattrs.  Step 2 (skipped when the object is flagged XATTR_NO_SPILL_OUT)
+// removes all attributes held in object_map and then writes the
+// XATTR_NO_SPILL_OUT marker on the file.  Returns 0 or a negative errno.
+int FileStore::_rmattrs(const coll_t& cid, const ghobject_t& oid,
+                        const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+  map<string,bufferptr> inline_attrs;
+  FDRef fd;
+  set<string> spilled_attrs;
+  Index index;
+
+  int r = lfn_open(cid, oid, false, &fd);
+  if (r >= 0) {
+    // Single-pass block: every `break` skips straight to lfn_close().
+    do {
+      char marker[2];
+      r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, marker, sizeof(marker));
+      const bool spill_out =
+        !(r >= 0 &&
+          strncmp(marker, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)) == 0);
+
+      // Step 1: inline xattrs.  A failure to list them is not fatal here;
+      // the value of r is simply carried forward.
+      r = _fgetattrs(**fd, inline_attrs);
+      bool inline_failed = false;
+      if (r >= 0) {
+        for (const auto& attr : inline_attrs) {
+          char raw_name[CHAIN_XATTR_MAX_NAME_LEN];
+          get_attrname(attr.first.c_str(), raw_name, CHAIN_XATTR_MAX_NAME_LEN);
+          r = chain_fremovexattr(**fd, raw_name);
+          if (r < 0) {
+            dout(10) << __FUNC__ << ": could not remove xattr r = " << r << dendl;
+            inline_failed = true;
+            break;
+          }
+        }
+      }
+      if (inline_failed)
+        break;
+
+      if (!spill_out) {
+        dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl;
+        break;
+      }
+
+      // Step 2: attributes kept in object_map.
+      r = get_index(cid, &index);
+      if (r < 0) {
+        dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+        break;
+      }
+
+      r = object_map->get_all_xattrs(oid, &spilled_attrs);
+      if (r < 0 && r != -ENOENT) {
+        dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+        if (r == -EIO && m_filestore_fail_eio) handle_eio();
+        break;
+      }
+
+      r = object_map->remove_xattrs(oid, spilled_attrs, &spos);
+      if (r < 0 && r != -ENOENT) {
+        dout(10) << __FUNC__ << ": could not remove omap_attrs r = " << r << dendl;
+        break;
+      }
+      if (r == -ENOENT)
+        r = 0;
+
+      // Record that nothing is spilled any more (result intentionally
+      // not checked, as before).
+      chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+                      sizeof(XATTR_NO_SPILL_OUT));
+    } while (false);
+
+    lfn_close(fd);
+  }
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+  return r;
+}
+
+
+
+
+// Remove every object listed in collection `cid`, then destroy the
+// collection itself.
+//
+// A collection that cannot be stat'ed with -ENOENT counts as success.
+// Objects are listed in batches of at most 300; the first failing list or
+// remove call aborts the operation and its error is returned.
+int FileStore::_collection_remove_recursive(const coll_t &cid,
+                                            const SequencerPosition &spos)
+{
+  struct stat st;
+  int r = collection_stat(cid, &st);
+  if (r < 0)
+    return (r == -ENOENT) ? 0 : r;
+
+  vector<ghobject_t> batch;
+  ghobject_t cursor;
+  while (!cursor.is_max()) {
+    r = collection_list(cid, cursor, ghobject_t::get_max(),
+                        300, &batch, &cursor);
+    if (r < 0)
+      return r;
+
+    for (const ghobject_t& obj : batch) {
+      ceph_assert(_check_replay_guard(cid, obj, spos));
+      r = _remove(cid, obj, spos);
+      if (r < 0)
+        return r;
+    }
+    batch.clear();
+  }
+  return _destroy_collection(cid);
+}
+
+// --------------------------
+// collections
+
+// Convenience overload: list collections, leaving temp collections out.
+int FileStore::list_collections(vector<coll_t>& ls)
+{
+  const bool include_temp = false;
+  return list_collections(ls, include_temp);
+}
+
+// Append to `ls` every collection found as a directory under
+// "<basedir>/current".
+//
+// Skipped entries: non-directories, "omap", "." and "..", names that do not
+// parse as a coll_t, and (unless include_temp is set) temp collections.
+// Returns 0 on success or a negative errno if the directory cannot be
+// opened, an entry cannot be stat'ed, or readdir() itself fails.
+int FileStore::list_collections(vector<coll_t>& ls, bool include_temp)
+{
+  tracepoint(objectstore, list_collections_enter);
+  dout(10) << __FUNC__ << dendl;
+
+  char fn[PATH_MAX];
+  snprintf(fn, sizeof(fn), "%s/current", basedir.c_str());
+
+  int r = 0;
+  DIR *dir = ::opendir(fn);
+  if (!dir) {
+    r = -errno;
+    derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl;
+    if (r == -EIO && m_filestore_fail_eio) handle_eio();
+    return r;
+  }
+
+  while (true) {
+    // readdir() returns NULL both at end-of-directory and on error; the only
+    // way to tell them apart is to clear errno first and inspect it after.
+    errno = 0;
+    struct dirent *de = ::readdir(dir);
+    if (!de) {
+      if (errno != 0) {
+        r = -errno;
+        derr << "trying readdir " << fn << ": " << cpp_strerror(-r) << dendl;
+      }
+      break;
+    }
+
+    if (de->d_type == DT_UNKNOWN) {
+      // d_type not supported (non-ext[234], btrfs), must stat
+      struct stat sb;
+      char filename[PATH_MAX];
+      if (int n = snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name);
+          n >= static_cast<int>(sizeof(filename))) {
+        derr << __func__ << " path length overrun: " << n << dendl;
+        ceph_abort();
+      }
+
+      r = ::stat(filename, &sb);
+      if (r < 0) {
+        r = -errno;
+        derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl;
+        if (r == -EIO && m_filestore_fail_eio) handle_eio();
+        break;
+      }
+      if (!S_ISDIR(sb.st_mode)) {
+        continue;
+      }
+    } else if (de->d_type != DT_DIR) {
+      continue;
+    }
+    if (strcmp(de->d_name, "omap") == 0) {
+      continue;
+    }
+    // skip "." and ".."
+    if (de->d_name[0] == '.' &&
+        (de->d_name[1] == '\0' ||
+         (de->d_name[1] == '.' &&
+          de->d_name[2] == '\0')))
+      continue;
+    coll_t cid;
+    if (!cid.parse(de->d_name)) {
+      derr << "ignoring invalid collection '" << de->d_name << "'" << dendl;
+      continue;
+    }
+    if (!cid.is_temp() || include_temp)
+      ls.push_back(cid);
+  }
+
+  ::closedir(dir);
+  if (r == -EIO && m_filestore_fail_eio) handle_eio();
+  tracepoint(objectstore, list_collections_exit, r);
+  return r;
+}
+
+// stat(2) the directory of collection `c` into *st.
+// Returns 0 on success or -errno on failure.
+int FileStore::collection_stat(const coll_t& c, struct stat *st)
+{
+  tracepoint(objectstore, collection_stat_enter, c.c_str());
+
+  char path[PATH_MAX];
+  get_cdir(c, path, sizeof(path));
+  dout(15) << __FUNC__ << ": " << path << dendl;
+
+  int r = ::stat(path, st);
+  if (r < 0) {
+    r = -errno;
+  }
+
+  dout(10) << __FUNC__ << ": " << path << " = " << r << dendl;
+  if (m_filestore_fail_eio && r == -EIO)
+    handle_eio();
+  tracepoint(objectstore, collection_stat_exit, r);
+  return r;
+}
+
+// True iff collection_stat() succeeds for `c`.
+bool FileStore::collection_exists(const coll_t& c)
+{
+  tracepoint(objectstore, collection_exists_enter, c.c_str());
+  struct stat unused;
+  const int rc = collection_stat(c, &unused);
+  bool present = (rc == 0);
+  tracepoint(objectstore, collection_exists_exit, present);
+  return present;
+}
+
+// Report through *empty whether collection `cid` lists no objects.
+//
+// Asks the collection index for at most one entry under its read lock.
+// Returns 0 on success (with *empty set) or a negative errno, in which case
+// *empty is left untouched.
+int FileStore::collection_empty(const coll_t& cid, bool *empty)
+{
+  tracepoint(objectstore, collection_empty_enter, cid.c_str());
+  dout(15) << __FUNC__ << ": " << cid << dendl;
+
+  Index index;
+  int r = get_index(cid, &index);
+  if (r < 0) {
+    derr << __FUNC__ << ": get_index returned: " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  ceph_assert(index.index);
+  RWLock::RLocker l((index.index)->access_lock);
+
+  // One entry is enough to decide emptiness.
+  vector<ghobject_t> probe;
+  r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(),
+                                     1, &probe, nullptr);
+  if (r < 0) {
+    derr << __FUNC__ << ": collection_list_partial returned: "
+         << cpp_strerror(r) << dendl;
+    if (m_filestore_fail_eio && r == -EIO)
+      handle_eio();
+    return r;
+  }
+
+  *empty = probe.empty();
+  tracepoint(objectstore, collection_empty_exit, *empty);
+  return 0;
+}
+
+// Store `bits` as a 32-bit value in the "bits" attribute of the directory
+// of collection `c`.  Returns the chain_fsetxattr() result, or -errno if
+// the directory cannot be opened.
+int FileStore::_collection_set_bits(const coll_t& c, int bits)
+{
+  char path[PATH_MAX];
+  get_cdir(c, path, sizeof(path));
+  dout(10) << __FUNC__ << ": " << path << " " << bits << dendl;
+
+  int r;
+  const int fd = ::open(path, O_RDONLY|O_CLOEXEC);
+  if (fd < 0) {
+    r = -errno;
+  } else {
+    char attr[PATH_MAX];
+    int32_t value = bits;
+    get_attrname("bits", attr, PATH_MAX);
+    r = chain_fsetxattr(fd, attr, (char*)&value, sizeof(value));
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+  }
+
+  dout(10) << __FUNC__ << ": " << path << " " << bits << " = " << r << dendl;
+  return r;
+}
+
+// Read back the 32-bit "bits" attribute from the directory of the
+// collection behind `ch`.  Returns the stored value, or a negative errno if
+// the directory cannot be opened or the attribute cannot be read.
+int FileStore::collection_bits(CollectionHandle& ch)
+{
+  char path[PATH_MAX];
+  get_cdir(ch->cid, path, sizeof(path));
+  dout(15) << __FUNC__ << ": " << path << dendl;
+
+  int32_t bits;
+  const int fd = ::open(path, O_RDONLY|O_CLOEXEC);
+  if (fd < 0) {
+    bits = -errno;
+  } else {
+    char attr[PATH_MAX];
+    get_attrname("bits", attr, PATH_MAX);
+    const int r = chain_fgetxattr(fd, attr, (char*)&bits, sizeof(bits));
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    if (r < 0) {
+      // the read failed: report the error instead of the buffer contents
+      bits = r;
+    }
+  }
+
+  dout(10) << __FUNC__ << ": " << path << " = " << bits << dendl;
+  return bits;
+}
+
+// List up to `max` objects of collection `c` in the range [orig_start, end).
+//
+// Results are written to *ls; *next (when supplied) receives the position
+// to continue from.  For a collection that is neither temp nor meta, a
+// start position that sorts before the pool -1 separator makes the call
+// list the matching temp collection first and only fall through to `c`
+// once the temp listing reports the max position as its next value.
+// Returns 0 on success or a negative errno.
+int FileStore::collection_list(const coll_t& c,
+                               const ghobject_t& orig_start,
+                               const ghobject_t& end,
+                               int max,
+                               vector<ghobject_t> *ls, ghobject_t *next)
+{
+  ghobject_t start = orig_start;
+  if (start.is_max())
+    return 0;
+
+  // Callers may pass a null `next`; give the code below something to write to.
+  ghobject_t scratch_next;
+  if (next == nullptr)
+    next = &scratch_next;
+
+  // Work out the pool id and shard.  They are needed to produce a
+  // meaningful 'next' value further down.
+  int64_t pool = -1;
+  shard_id_t shard;
+  {
+    spg_t pgid;
+    if (c.is_temp(&pgid)) {
+      pool = -2 - pgid.pool();
+      shard = pgid.shard;
+    } else if (c.is_pg(&pgid)) {
+      pool = pgid.pool();
+      shard = pgid.shard;
+    } else if (c.is_meta()) {
+      pool = -1;
+      shard = shard_id_t::NO_SHARD;
+    } else {
+      // The caller is test code.  That usage should be killed off
+      // eventually; tolerate it for now.
+      pool = 0;
+      shard = shard_id_t::NO_SHARD;
+    }
+    dout(20) << __FUNC__ << ": pool is " << pool << " shard is " << shard
+             << " pgid " << pgid << dendl;
+  }
+
+  // Separator between temp-collection positions and regular ones.
+  ghobject_t sep;
+  sep.hobj.pool = -1;
+  sep.set_shard(shard);
+
+  if (!c.is_temp() && !c.is_meta()) {
+    if (!(start < sep)) {
+      dout(10) << __FUNC__ << ": start " << start << " >= sep " << sep << dendl;
+    } else {
+      dout(10) << __FUNC__ << ": first checking temp pool" << dendl;
+      coll_t temp = c.get_temp();
+      const int temp_r = collection_list(temp, start, end, max, ls, next);
+      if (temp_r < 0)
+        return temp_r;
+      if (*next != ghobject_t::get_max())
+        return temp_r;
+      start = sep;
+      dout(10) << __FUNC__ << ": fall through to non-temp collection, start "
+               << start << dendl;
+    }
+  }
+
+  Index index;
+  int r = get_index(c, &index);
+  if (r < 0)
+    return r;
+
+  ceph_assert(index.index);
+  RWLock::RLocker l((index.index)->access_lock);
+
+  r = index->collection_list_partial(start, end, max, ls, next);
+  if (r < 0) {
+    if (m_filestore_fail_eio && r == -EIO)
+      handle_eio();
+    return r;
+  }
+  dout(20) << "objects: " << *ls << dendl;
+
+  // HashIndex doesn't know the pool when constructing a 'next' value
+  if (!next->is_max()) {
+    next->hobj.pool = pool;
+    next->set_shard(shard);
+    dout(20) << "  next " << *next << dendl;
+  }
+
+  return 0;
+}
+
+// Read the omap header and key/value pairs of `hoid` from object_map.
+//
+// Waits for pending applies on the object and checks (under the index read
+// lock) that lfn_find() succeeds for it before reading.  -ENOENT from
+// object_map is not reported as an error; any other failure is returned.
+int FileStore::omap_get(CollectionHandle& ch, const ghobject_t &hoid,
+                        bufferlist *header,
+                        map<string, bufferlist> *out)
+{
+  tracepoint(objectstore, omap_get_enter, ch->cid.c_str());
+  const bool use_main = !_need_temp_object_collection(ch->cid, hoid);
+  const coll_t& c = use_main ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  static_cast<OpSequencer*>(ch.get())->wait_for_apply(hoid);
+
+  Index index;
+  int r = get_index(c, &index);
+  if (r < 0)
+    return r;
+
+  {
+    // the lock only needs to cover the lookup
+    ceph_assert(index.index);
+    RWLock::RLocker l((index.index)->access_lock);
+    r = lfn_find(hoid, index);
+    if (r < 0)
+      return r;
+  }
+
+  r = object_map->get(hoid, header, out);
+  const bool failed = (r < 0 && r != -ENOENT);
+  if (failed) {
+    if (m_filestore_fail_eio && r == -EIO)
+      handle_eio();
+    return r;
+  }
+
+  tracepoint(objectstore, omap_get_exit, 0);
+  return 0;
+}
+
+// Read the omap header of `hoid` into *bl.
+//
+// Same preconditions as the other omap readers (pending applies are waited
+// for, the object must be found in its index).  -ENOENT from object_map is
+// not an error.  On any other failure the code asserts that an -EIO is
+// acceptable (allow_eio set, or m_filestore_fail_eio clear) and returns the
+// error.
+int FileStore::omap_get_header(
+  CollectionHandle& ch,
+  const ghobject_t &hoid,
+  bufferlist *bl,
+  bool allow_eio)
+{
+  tracepoint(objectstore, omap_get_header_enter, ch->cid.c_str());
+  const bool use_main = !_need_temp_object_collection(ch->cid, hoid);
+  const coll_t& c = use_main ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  static_cast<OpSequencer*>(ch.get())->wait_for_apply(hoid);
+
+  Index index;
+  int r = get_index(c, &index);
+  if (r < 0)
+    return r;
+
+  {
+    // the lock only needs to cover the lookup
+    ceph_assert(index.index);
+    RWLock::RLocker l((index.index)->access_lock);
+    r = lfn_find(hoid, index);
+    if (r < 0)
+      return r;
+  }
+
+  r = object_map->get_header(hoid, bl);
+  const bool failed = (r < 0 && r != -ENOENT);
+  if (failed) {
+    ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
+    return r;
+  }
+
+  tracepoint(objectstore, omap_get_header_exit, 0);
+  return 0;
+}
+
+// Collect the omap keys of `hoid` into *keys.
+//
+// Waits for pending applies and verifies the object via lfn_find() before
+// querying object_map.  -ENOENT from object_map is not an error; other
+// failures are returned.
+int FileStore::omap_get_keys(CollectionHandle& ch, const ghobject_t &hoid, set<string> *keys)
+{
+  tracepoint(objectstore, omap_get_keys_enter, ch->cid.c_str());
+  const bool use_main = !_need_temp_object_collection(ch->cid, hoid);
+  const coll_t& c = use_main ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  static_cast<OpSequencer*>(ch.get())->wait_for_apply(hoid);
+
+  Index index;
+  int r = get_index(c, &index);
+  if (r < 0)
+    return r;
+
+  {
+    // the lock only needs to cover the lookup
+    ceph_assert(index.index);
+    RWLock::RLocker l((index.index)->access_lock);
+    r = lfn_find(hoid, index);
+    if (r < 0)
+      return r;
+  }
+
+  r = object_map->get_keys(hoid, keys);
+  const bool failed = (r < 0 && r != -ENOENT);
+  if (failed) {
+    if (m_filestore_fail_eio && r == -EIO)
+      handle_eio();
+    return r;
+  }
+
+  tracepoint(objectstore, omap_get_keys_exit, 0);
+  return 0;
+}
+
+// Fetch the values of the requested omap `keys` of `hoid` into *out.
+//
+// Returns 0 on success (including -ENOENT from object_map) or a negative
+// errno.  The closing debug line names the step that failed.
+int FileStore::omap_get_values(CollectionHandle& ch, const ghobject_t &hoid,
+                               const set<string> &keys,
+                               map<string, bufferlist> *out)
+{
+  tracepoint(objectstore, omap_get_values_enter, ch->cid.c_str());
+  const bool use_main = !_need_temp_object_collection(ch->cid, hoid);
+  const coll_t& c = use_main ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  static_cast<OpSequencer*>(ch.get())->wait_for_apply(hoid);
+
+  Index index;
+  const char *where = "()";   // suffix for the final log line
+  int r = get_index(c, &index);
+  if (r < 0) {
+    where = " (get_index)";
+  } else {
+    {
+      // the lock only needs to cover the lookup
+      ceph_assert(index.index);
+      RWLock::RLocker l((index.index)->access_lock);
+      r = lfn_find(hoid, index);
+    }
+    if (r < 0) {
+      where = " (lfn_find)";
+    } else {
+      r = object_map->get_values(hoid, keys, out);
+      if (r < 0 && r != -ENOENT) {
+        if (r == -EIO && m_filestore_fail_eio) handle_eio();
+        where = " (get_values)";
+      } else {
+        r = 0;
+      }
+    }
+  }
+
+  tracepoint(objectstore, omap_get_values_exit, r);
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << " = " << r
+           << where << dendl;
+  return r;
+}
+
+// Ask object_map which of `keys` exist in the omap of `hoid`; the answer is
+// delivered through *out.
+//
+// Waits for pending applies and verifies the object via lfn_find() first.
+// -ENOENT from object_map is not an error; other failures are returned.
+int FileStore::omap_check_keys(CollectionHandle& ch, const ghobject_t &hoid,
+                               const set<string> &keys,
+                               set<string> *out)
+{
+  tracepoint(objectstore, omap_check_keys_enter, ch->cid.c_str());
+  const bool use_main = !_need_temp_object_collection(ch->cid, hoid);
+  const coll_t& c = use_main ? ch->cid : ch->cid.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  static_cast<OpSequencer*>(ch.get())->wait_for_apply(hoid);
+
+  Index index;
+  int r = get_index(c, &index);
+  if (r < 0)
+    return r;
+
+  {
+    // the lock only needs to cover the lookup
+    ceph_assert(index.index);
+    RWLock::RLocker l((index.index)->access_lock);
+    r = lfn_find(hoid, index);
+    if (r < 0)
+      return r;
+  }
+
+  r = object_map->check_keys(hoid, keys, out);
+  const bool failed = (r < 0 && r != -ENOENT);
+  if (failed) {
+    if (m_filestore_fail_eio && r == -EIO)
+      handle_eio();
+    return r;
+  }
+
+  tracepoint(objectstore, omap_check_keys_exit, 0);
+  return 0;
+}
+
+// Handle-based entry point: wait for pending applies on `oid`, then
+// delegate to the coll_t overload.
+ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(
+  CollectionHandle& ch,
+  const ghobject_t &oid)
+{
+  static_cast<OpSequencer*>(ch.get())->wait_for_apply(oid);
+  return get_omap_iterator(ch->cid, oid);
+}
+
+// Return an object_map iterator for `hoid`, or a default-constructed
+// iterator when the collection index cannot be obtained or lfn_find()
+// fails for the object.
+ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(const coll_t& _c,
+                                                          const ghobject_t &hoid)
+{
+  tracepoint(objectstore, get_omap_iterator, _c.c_str());
+  const bool use_main = !_need_temp_object_collection(_c, hoid);
+  const coll_t& c = use_main ? _c : _c.get_temp();
+  dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+  Index index;
+  int r = get_index(c, &index);
+  if (r < 0) {
+    dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 "
+             << "(get_index failed with " << cpp_strerror(r) << ")" << dendl;
+    return ObjectMap::ObjectMapIterator();
+  }
+
+  {
+    // the lock only needs to cover the lookup
+    ceph_assert(index.index);
+    RWLock::RLocker l((index.index)->access_lock);
+    r = lfn_find(hoid, index);
+    if (r < 0) {
+      dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 "
+               << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl;
+      return ObjectMap::ObjectMapIterator();
+    }
+  }
+
+  return object_map->get_iterator(hoid);
+}
+
+// Apply an "expected number of objects" hint to collection `c` by
+// pre-hashing its index.
+//
+// A non-empty collection is left alone unless we are replaying: a message
+// is logged and 0 is returned.  On success the replay guard for `c` is set.
+// Returns 0 or a negative errno.
+int FileStore::_collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
+    uint64_t expected_num_objs,
+    const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": collection: " << c << " pg number: "
+     << pg_num << " expected number of objects: " << expected_num_objs << dendl;
+
+  bool is_empty;
+  int rc = collection_empty(c, &is_empty);
+  if (rc < 0)
+    return rc;
+
+  if (!(is_empty || replaying)) {
+    dout(0) << "Failed to give an expected number of objects hint to collection : "
+      << c << ", only empty collection can take such type of hint. " << dendl;
+    return 0;
+  }
+
+  Index index;
+  rc = get_index(c, &index);
+  if (rc < 0)
+    return rc;
+
+  // Pre-hash the collection
+  rc = index->pre_hash_collection(pg_num, expected_num_objs);
+  dout(10) << "pre_hash_collection " << c << " = " << rc << dendl;
+  if (rc < 0)
+    return rc;
+
+  _set_replay_guard(c, spos);
+  return 0;
+}
+
+// Create collection `c`: make its directory, initialise its index, store
+// `bits`, and (for collections that are neither meta nor temp) create the
+// parallel temp collection with bits 0.  Finally set the replay guard.
+//
+// An already existing directory is tolerated only while replaying.
+// Returns 0 or a negative errno from the first failing step.
+int FileStore::_create_collection(
+  const coll_t& c,
+  int bits,
+  const SequencerPosition &spos)
+{
+  char path[PATH_MAX];
+  get_cdir(c, path, sizeof(path));
+  dout(15) << __FUNC__ << ": " << path << dendl;
+
+  int r = ::mkdir(path, 0755);
+  if (r < 0) {
+    r = -errno;
+    if (r == -EEXIST && replaying)
+      r = 0;
+  }
+  dout(10) << __FUNC__ << ": " << path << " = " << r << dendl;
+  if (r < 0)
+    return r;
+
+  if ((r = init_index(c)) < 0)
+    return r;
+  if ((r = _collection_set_bits(c, bits)) < 0)
+    return r;
+
+  // create parallel temp collection, too
+  if (!(c.is_meta() || c.is_temp())) {
+    coll_t temp = c.get_temp();
+    if ((r = _create_collection(temp, 0, spos)) < 0)
+      return r;
+  }
+
+  _set_replay_guard(c, spos);
+  return 0;
+}
+
+// Destroy collection `c`: prepare its index for deletion under the write
+// lock, rmdir its directory, and then destroy the parallel temp collection
+// (for collections that are neither meta nor temp).
+//
+// The temp collection is destroyed even when the main steps failed.  The
+// returned value is the temp collection's error if it failed, otherwise
+// the result of the main steps.
+int FileStore::_destroy_collection(const coll_t& c)
+{
+  int r = 0;
+  char path[PATH_MAX];
+  get_cdir(c, path, sizeof(path));
+  dout(15) << __FUNC__ << ": " << path << dendl;
+
+  {
+    Index from;
+    r = get_index(c, &from);
+    if (r >= 0) {
+      ceph_assert(from.index);
+      RWLock::WLocker l((from.index)->access_lock);
+      r = from->prep_delete();
+    }
+  }
+
+  if (r >= 0) {
+    r = ::rmdir(path);
+    if (r < 0)
+      r = -errno;
+  }
+
+  // destroy parallel temp collection, too
+  if (!(c.is_meta() || c.is_temp())) {
+    coll_t temp = c.get_temp();
+    const int temp_r = _destroy_collection(temp);
+    if (temp_r < 0)
+      r = temp_r;
+  }
+
+  dout(10) << __FUNC__ << ": " << path << " = " << r << dendl;
+  return r;
+}
+
+
+// Link object `o` from collection `oldcid` into collection `c` under the
+// same name, bracketed by replay guards.
+//
+// Returns 0 without doing anything when either replay-guard check is
+// negative.  A source that cannot be opened is only expected during replay
+// (asserted) and is also reported as 0.  Otherwise returns the lfn_link()
+// result, with -EEXIST mapped to 0 when replaying on a backend that cannot
+// checkpoint.
+int FileStore::_collection_add(const coll_t& c, const coll_t& oldcid, const ghobject_t& o,
+                               const SequencerPosition& spos)
+{
+  dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << dendl;
+
+  const int dstcmp = _check_replay_guard(c, o, spos);
+  if (dstcmp < 0)
+    return 0;
+
+  // The source name is checked as well: it might carry a newer guard that
+  // must not be clobbered.
+  const int srccmp = _check_replay_guard(oldcid, o, spos);
+  if (srccmp < 0)
+    return 0;
+
+  // Open a guard on the object so that we don't replay any previous
+  // operations on the new name that would modify the source inode.
+  FDRef fd;
+  int r = lfn_open(oldcid, o, 0, &fd);
+  if (r < 0) {
+    // The source collection/object does not exist.  That should only
+    // happen while replaying, in which case it is safe to move on.
+    ceph_assert(replaying);
+    dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+             << oldcid << "/" << o << " (dne, continue replay) " << dendl;
+    return 0;
+  }
+
+  // dstcmp == 0 means the guard already says "in-progress"
+  if (dstcmp > 0)
+    _set_replay_guard(**fd, spos, &o, true);
+
+  r = lfn_link(oldcid, c, o, o);
+  if (replaying && !backend->can_checkpoint() && r == -EEXIST) {
+    // crashed between link() and set_replay_guard()
+    r = 0;
+  }
+
+  _inject_failure();
+
+  // close guard on object so we don't do this again
+  if (r == 0)
+    _close_replay_guard(**fd, spos);
+  lfn_close(fd);
+
+  dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl;
+  return r;
+}
+
+// Move object oldcid/oldoid to c/o: link the file under the new name,
+// rename its omap content, unlink the old name, and close the replay guard
+// on the new name.
+//
+// Replay handling visible here:
+//  - destination collection missing during replay, or a negative guard
+//    check on the destination: only the source is removed (if its own guard
+//    check is positive);
+//  - negative guard check on the source: nothing to do, return 0;
+//  - source cannot be opened: tolerated when replaying or when
+//    allow_enoent is set, otherwise the process aborts.
+// Returns 0 or a negative errno.
+int FileStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+                                       coll_t c, const ghobject_t& o,
+                                       const SequencerPosition& spos,
+                                       bool allow_enoent)
+{
+  dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
+  int r = 0;
+  int dstcmp, srccmp;
+
+  // If the destination collection doesn't exist during replay, delete
+  // the source object and carry on.
+  if (replaying && !collection_exists(c))
+    goto out_rm_src;
+
+  dstcmp = _check_replay_guard(c, o, spos);
+  if (dstcmp < 0)
+    goto out_rm_src;
+
+  // The source name is checked as well: it might carry a newer guard that
+  // must not be clobbered.
+  srccmp = _check_replay_guard(oldcid, oldoid, spos);
+  if (srccmp < 0)
+    return 0;
+
+  {
+    // Open a guard on the object so that we don't replay any previous
+    // operations on the new name that would modify the source inode.
+    FDRef fd;
+    r = lfn_open(oldcid, oldoid, 0, &fd);
+    if (r >= 0) {
+      // dstcmp == 0 means the guard already says "in-progress"
+      if (dstcmp > 0)
+        _set_replay_guard(**fd, spos, &o, true);
+
+      r = lfn_link(oldcid, c, oldoid, o);
+      if (replaying && !backend->can_checkpoint() && r == -EEXIST) {
+        // crashed between link() and set_replay_guard()
+        r = 0;
+      }
+
+      lfn_close(fd);
+      fd = FDRef();
+
+      _inject_failure();
+    } else {
+      // The source collection/object does not exist.
+      if (!replaying) {
+        if (!allow_enoent)
+          ceph_abort_msg("ERROR: source must exist");
+        dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+                 << oldcid << "/" << oldoid << " (dne, ignoring enoent)"
+                 << dendl;
+        return 0;
+      }
+
+      // Replaying: this should be safe, keep going.
+      dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+               << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl;
+
+      // if dstcmp == 0, try_rename was started.
+      if (allow_enoent && dstcmp > 0)
+        return 0;
+
+      r = 0; // don't know if object_map was cloned
+    }
+
+    if (r == 0) {
+      // the name changed; link the omap content
+      r = object_map->rename(oldoid, o, &spos);
+      if (r == -ENOENT)
+        r = 0;
+    }
+
+    _inject_failure();
+
+    if (r == 0)
+      r = lfn_unlink(oldcid, oldoid, spos, true);
+
+    if (r == 0)
+      r = lfn_open(c, o, 0, &fd);
+
+    // close guard on object so we don't do this again
+    if (r == 0) {
+      _close_replay_guard(**fd, spos, &o);
+      lfn_close(fd);
+    }
+  }
+
+  dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
+           << " = " << r << dendl;
+  return r;
+
+ out_rm_src:
+  // remove source
+  if (_check_replay_guard(oldcid, oldoid, spos) > 0)
+    r = lfn_unlink(oldcid, oldoid, spos, true);
+
+  dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
+           << " = " << r << dendl;
+  return r;
+}
+
+// Failure-injection hook: while m_filestore_kill_at is non-zero, each call
+// decrements it, and the call that brings it to zero flushes the log and
+// terminates the process with _exit(1).
+void FileStore::_inject_failure()
+{
+  if (!m_filestore_kill_at)
+    return;
+
+  const int remaining = --m_filestore_kill_at;
+  dout(5) << __FUNC__ << ": " << (remaining+1) << " -> " << remaining << dendl;
+  if (remaining != 0)
+    return;
+
+  derr << __FUNC__ << ": KILLING" << dendl;
+  cct->_log->flush();
+  _exit(1);
+}
+
+// Clear the omap keys and header of `hoid`.  The object must be found in
+// the index of `cid`.  -ENOENT from object_map is not an error.
+int FileStore::_omap_clear(const coll_t& cid, const ghobject_t &hoid,
+                           const SequencerPosition &spos) {
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+
+  Index index;
+  int r = get_index(cid, &index);
+  if (r < 0)
+    return r;
+
+  {
+    // the lock only needs to cover the lookup
+    ceph_assert(index.index);
+    RWLock::RLocker l((index.index)->access_lock);
+    r = lfn_find(hoid, index);
+    if (r < 0)
+      return r;
+  }
+
+  r = object_map->clear_keys_header(hoid, &spos);
+  return (r < 0 && r != -ENOENT) ? r : 0;
+}
+
+// Set the omap key/value pairs in `aset` on `hoid`.
+//
+// pgmeta objects are treated as logical objects, so the existence check
+// against the collection index is skipped for them.  Returns the
+// object_map->set_keys() result, or the error from the existence check.
+int FileStore::_omap_setkeys(const coll_t& cid, const ghobject_t &hoid,
+                             const map<string, bufferlist> &aset,
+                             const SequencerPosition &spos) {
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+
+  Index index;
+  int r;
+  if (!hoid.is_pgmeta()) {
+    r = get_index(cid, &index);
+    if (r < 0) {
+      dout(20) << __FUNC__ << ": get_index got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    ceph_assert(index.index);
+    RWLock::RLocker l((index.index)->access_lock);
+    r = lfn_find(hoid, index);
+    if (r < 0) {
+      dout(20) << __FUNC__ << ": lfn_find got " << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
+  // Only walk the key set when level-20 logging is actually gathered.
+  if (g_conf()->subsys.should_gather<ceph_subsys_filestore, 20>()) {
+    for (const auto& kv : aset) {
+      dout(20) << __FUNC__ << ":  set " << kv.first << dendl;
+    }
+  }
+
+  r = object_map->set_keys(hoid, aset, &spos);
+  dout(20) << __FUNC__ << ": " << cid << "/" << hoid << " = " << r << dendl;
+  return r;
+}
+
+// Remove the omap `keys` from `hoid`.
+//
+// pgmeta objects are treated as logical objects, so the existence check
+// against the collection index is skipped for them.  -ENOENT from
+// object_map is not an error.
+int FileStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &hoid,
+                            const set<string> &keys,
+                            const SequencerPosition &spos) {
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+
+  Index index;
+  int r;
+  if (!hoid.is_pgmeta()) {
+    r = get_index(cid, &index);
+    if (r < 0)
+      return r;
+
+    ceph_assert(index.index);
+    RWLock::RLocker l((index.index)->access_lock);
+    r = lfn_find(hoid, index);
+    if (r < 0)
+      return r;
+  }
+
+  r = object_map->rm_keys(hoid, keys, &spos);
+  return (r < 0 && r != -ENOENT) ? r : 0;
+}
+
+// Remove every omap key k of `hoid` with first <= k < last.
+//
+// The keys are gathered with an omap iterator and then handed to
+// _omap_rmkeys().  Returns -ENOENT when no iterator can be obtained.
+int FileStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &hoid,
+                                const string& first, const string& last,
+                                const SequencerPosition &spos) {
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl;
+
+  set<string> doomed;
+  {
+    // keep the iterator's lifetime limited to the collection phase
+    ObjectMap::ObjectMapIterator it = get_omap_iterator(cid, hoid);
+    if (!it)
+      return -ENOENT;
+
+    it->lower_bound(first);
+    while (it->valid() && it->key() < last) {
+      doomed.insert(it->key());
+      it->next();
+    }
+  }
+  return _omap_rmkeys(cid, hoid, doomed, spos);
+}
+
+// Replace the omap header of `hoid` with `bl`.  The object must be found
+// in the index of `cid`; the object_map->set_header() result is returned.
+int FileStore::_omap_setheader(const coll_t& cid, const ghobject_t &hoid,
+                               const bufferlist &bl,
+                               const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+
+  Index index;
+  int r = get_index(cid, &index);
+  if (r < 0)
+    return r;
+
+  {
+    // the lock only needs to cover the lookup
+    ceph_assert(index.index);
+    RWLock::RLocker l((index.index)->access_lock);
+    r = lfn_find(hoid, index);
+    if (r < 0)
+      return r;
+  }
+
+  return object_map->set_header(hoid, bl, &spos);
+}
+
+// Merge collection `cid` into `dest` (which must be a PG collection) and
+// remove `cid` afterwards.
+//
+// Both the main collections and their temp counterparts are merged.  If
+// either collection is missing the call asserts that we are replaying and
+// returns 0; a negative replay-guard check on either side also returns 0.
+//
+// Returns 0 on success.  If the main merge fails its error is returned even
+// when the temp merge succeeds afterwards (previously the temp result
+// overwrote it); otherwise the temp merge's result is returned.
+int FileStore::_merge_collection(const coll_t& cid,
+                                 uint32_t bits,
+                                 coll_t dest,
+                                 const SequencerPosition &spos)
+{
+  dout(15) << __FUNC__ << ": " << cid << " " << dest
+           << " bits " << bits << dendl;
+  int r = 0;
+
+  if (!collection_exists(cid)) {
+    dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl;
+    ceph_assert(replaying);
+    return 0;
+  }
+  if (!collection_exists(dest)) {
+    dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl;
+    ceph_assert(replaying);
+    return 0;
+  }
+
+  // set bits
+  if (_check_replay_guard(cid, spos) > 0)
+    _collection_set_bits(dest, bits);
+
+  spg_t pgid;
+  bool is_pg = dest.is_pg(&pgid);
+  ceph_assert(is_pg);
+
+  int dstcmp = _check_replay_guard(dest, spos);
+  if (dstcmp < 0)
+    return 0;
+
+  int srccmp = _check_replay_guard(cid, spos);
+  if (srccmp < 0)
+    return 0;
+
+  _set_global_replay_guard(cid, spos);
+  _set_replay_guard(cid, spos, true);
+  _set_replay_guard(dest, spos, true);
+
+  // main collection
+  {
+    Index from;
+    r = get_index(cid, &from);
+
+    Index to;
+    if (!r)
+      r = get_index(dest, &to);
+
+    if (!r) {
+      ceph_assert(from.index);
+      RWLock::WLocker l1((from.index)->access_lock);
+
+      ceph_assert(to.index);
+      RWLock::WLocker l2((to.index)->access_lock);
+
+      r = from->merge(bits, to.index);
+    }
+  }
+
+  // temp too
+  {
+    // Use a separate result so that a failure of the main merge above is
+    // not silently overwritten by the outcome of the temp merge.
+    Index from;
+    int temp_r = get_index(cid.get_temp(), &from);
+
+    Index to;
+    if (!temp_r)
+      temp_r = get_index(dest.get_temp(), &to);
+
+    if (!temp_r) {
+      ceph_assert(from.index);
+      RWLock::WLocker l1((from.index)->access_lock);
+
+      ceph_assert(to.index);
+      RWLock::WLocker l2((to.index)->access_lock);
+
+      temp_r = from->merge(bits, to.index);
+    }
+
+    if (!r)
+      r = temp_r;
+  }
+
+  // remove source
+  _destroy_collection(cid);
+
+  _close_replay_guard(dest, spos);
+  _close_replay_guard(dest.get_temp(), spos);
+  // no need to close guards on cid... it's removed.
+
+  // Optional debug pass: every object now listed in dest must match
+  // (bits, ps of dest's pgid).
+  if (!r && cct->_conf->filestore_debug_verify_split) {
+    vector<ghobject_t> objects;
+    ghobject_t next;
+    while (1) {
+      collection_list(
+        dest,
+        next, ghobject_t::get_max(),
+        get_ideal_list_max(),
+        &objects,
+        &next);
+      if (objects.empty())
+        break;
+      for (auto& obj : objects) {
+        if (!obj.match(bits, pgid.pgid.ps())) {
+          // the objects being checked were listed from dest, so name dest
+          dout(20) << __FUNC__ << ": " << obj << " does not belong in "
+                   << dest << dendl;
+          ceph_assert(obj.match(bits, pgid.pgid.ps()));
+        }
+      }
+      objects.clear();
+    }
+  }
+
+  dout(15) << __FUNC__ << ": " << cid << " " << dest << " bits " << bits
+           << " = " << r << dendl;
+  return r;
+}
+
+// Split collection `cid`: objects matching (bits, rem) are moved to `dest`
+// by the collection index, then `bits` is stored on `cid`.
+//
+// If either collection is missing the call asserts that we are replaying
+// and returns 0; a negative replay-guard check on either side also returns
+// 0.  Otherwise returns the result of obtaining the indexes / performing
+// the split.  With filestore_debug_verify_split set, a successful split is
+// followed by a pass asserting that no object left in `cid` matches
+// (bits, rem) and that every object in `dest` does.
+int FileStore::_split_collection(const coll_t& cid,
+                                 uint32_t bits,
+                                 uint32_t rem,
+                                 coll_t dest,
+                                 const SequencerPosition &spos)
+{
+  int r;
+  {
+    dout(15) << __FUNC__ << ": " << cid << " bits: " << bits << dendl;
+    if (!collection_exists(cid)) {
+      dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl;
+      ceph_assert(replaying);
+      return 0;
+    }
+    if (!collection_exists(dest)) {
+      dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl;
+      ceph_assert(replaying);
+      return 0;
+    }
+
+    int dstcmp = _check_replay_guard(dest, spos);
+    if (dstcmp < 0)
+      return 0;
+
+    int srccmp = _check_replay_guard(cid, spos);
+    if (srccmp < 0)
+      return 0;
+
+    _set_global_replay_guard(cid, spos);
+    _set_replay_guard(cid, spos, true);
+    _set_replay_guard(dest, spos, true);
+
+    Index from;
+    r = get_index(cid, &from);
+
+    Index to;
+    if (!r)
+      r = get_index(dest, &to);
+
+    if (!r) {
+      ceph_assert(from.index);
+      RWLock::WLocker l1((from.index)->access_lock);
+
+      ceph_assert(to.index);
+      RWLock::WLocker l2((to.index)->access_lock);
+
+      r = from->split(rem, bits, to.index);
+    }
+
+    _close_replay_guard(cid, spos);
+    _close_replay_guard(dest, spos);
+  }
+  _collection_set_bits(cid, bits);
+  if (!r && cct->_conf->filestore_debug_verify_split) {
+    vector<ghobject_t> objects;
+    ghobject_t next;
+
+    // Nothing that stayed behind in the source may match (bits, rem).
+    while (1) {
+      collection_list(
+        cid,
+        next, ghobject_t::get_max(),
+        get_ideal_list_max(),
+        &objects,
+        &next);
+      if (objects.empty())
+        break;
+      for (auto& obj : objects) {
+        dout(20) << __FUNC__ << ": " << obj << " still in source "
+                 << cid << dendl;
+        ceph_assert(!obj.match(bits, rem));
+      }
+      objects.clear();
+    }
+
+    // Everything that ended up in dest must match (bits, rem).
+    next = ghobject_t();
+    while (1) {
+      collection_list(
+        dest,
+        next, ghobject_t::get_max(),
+        get_ideal_list_max(),
+        &objects,
+        &next);
+      if (objects.empty())
+        break;
+      for (auto& obj : objects) {
+        // log the destination collection (the object used to be printed
+        // twice here)
+        dout(20) << __FUNC__ << ": " << obj << " now in dest "
+                 << dest << dendl;
+        ceph_assert(obj.match(bits, rem));
+      }
+      objects.clear();
+    }
+  }
+  return r;
+}
+
+// Pass an allocation hint for object cid/oid to the backend.
+//
+// Nothing is done (and 0 is returned) when either expected size is 0.
+// Otherwise the hint is expected_write_size capped at
+// m_filestore_max_alloc_hint_size.  Returns the lfn_open() error or the
+// backend's result; the function asserts that the result is not -EIO when
+// m_filestore_fail_eio is set.
+int FileStore::_set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
+                               uint64_t expected_object_size,
+                               uint64_t expected_write_size)
+{
+  dout(15) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl;
+
+  FDRef fd;
+  int ret = 0;
+
+  const bool have_sizes = (expected_object_size != 0 && expected_write_size != 0);
+  if (have_sizes) {
+    ret = lfn_open(cid, oid, false, &fd);
+    if (ret >= 0) {
+      // TODO: a more elaborate hint calculation
+      const uint64_t hint =
+        std::min<uint64_t>(expected_write_size, m_filestore_max_alloc_hint_size);
+
+      ret = backend->set_alloc_hint(**fd, hint);
+      dout(20) << __FUNC__ << ": hint " << hint << " ret " << ret << dendl;
+
+      lfn_close(fd);
+    }
+  }
+
+  dout(10) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl;
+  ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+  return ret;
+}
+
+const char** FileStore::get_tracked_conf_keys() const
+{  // Returns the config option names this store observes, as a nullptr-terminated array.
+  static const char* KEYS[] = {  // every key listed here is examined in handle_conf_change()
+    "filestore_max_inline_xattr_size",
+    "filestore_max_inline_xattr_size_xfs",
+    "filestore_max_inline_xattr_size_btrfs",
+    "filestore_max_inline_xattr_size_other",
+    "filestore_max_inline_xattrs",
+    "filestore_max_inline_xattrs_xfs",
+    "filestore_max_inline_xattrs_btrfs",
+    "filestore_max_inline_xattrs_other",
+    "filestore_max_xattr_value_size",
+    "filestore_max_xattr_value_size_xfs",
+    "filestore_max_xattr_value_size_btrfs",
+    "filestore_max_xattr_value_size_other",
+    "filestore_min_sync_interval",
+    "filestore_max_sync_interval",
+    "filestore_queue_max_ops",
+    "filestore_queue_max_bytes",
+    "filestore_expected_throughput_bytes",
+    "filestore_expected_throughput_ops",
+    "filestore_queue_low_threshhold",
+    "filestore_queue_high_threshhold",
+    "filestore_queue_high_delay_multiple",
+    "filestore_queue_max_delay_multiple",
+    "filestore_commit_timeout",
+    "filestore_dump_file",
+    "filestore_kill_at",
+    "filestore_fail_eio",
+    "filestore_fadvise",
+    "filestore_sloppy_crc",
+    "filestore_sloppy_crc_block_size",
+    "filestore_max_alloc_hint_size",
+    nullptr  // sentinel marking the end of the list
+  };
+  return KEYS;
+}
+
+void FileStore::handle_conf_change(const ConfigProxy& conf,
+			  const std::set <std::string> &changed)
+{  // Refreshes the cached settings that correspond to the option names present in 'changed'.
+  if (changed.count("filestore_max_inline_xattr_size") ||
+      changed.count("filestore_max_inline_xattr_size_xfs") ||
+      changed.count("filestore_max_inline_xattr_size_btrfs") ||
+      changed.count("filestore_max_inline_xattr_size_other") ||
+      changed.count("filestore_max_inline_xattrs") ||
+      changed.count("filestore_max_inline_xattrs_xfs") ||
+      changed.count("filestore_max_inline_xattrs_btrfs") ||
+      changed.count("filestore_max_inline_xattrs_other") ||
+      changed.count("filestore_max_xattr_value_size") ||
+      changed.count("filestore_max_xattr_value_size_xfs") ||
+      changed.count("filestore_max_xattr_value_size_btrfs") ||
+      changed.count("filestore_max_xattr_value_size_other")) {
+    if (backend) {  // xattr limits are only recomputed once a backend pointer is set
+      Mutex::Locker l(lock);
+      set_xattr_limits_via_conf();  // recomputes all three limits, whichever key changed
+    }
+  }
+
+  if (changed.count("filestore_queue_max_bytes") ||
+      changed.count("filestore_queue_max_ops") ||
+      changed.count("filestore_expected_throughput_bytes") ||
+      changed.count("filestore_expected_throughput_ops") ||
+      changed.count("filestore_queue_low_threshhold") ||
+      changed.count("filestore_queue_high_threshhold") ||
+      changed.count("filestore_queue_high_delay_multiple") ||
+      changed.count("filestore_queue_max_delay_multiple")) {
+    Mutex::Locker l(lock);
+    set_throttle_params();  // its 0 / -EINVAL result is not inspected here
+  }
+
+  if (changed.count("filestore_min_sync_interval") ||
+      changed.count("filestore_max_sync_interval") ||
+      changed.count("filestore_kill_at") ||
+      changed.count("filestore_fail_eio") ||
+      changed.count("filestore_sloppy_crc") ||
+      changed.count("filestore_sloppy_crc_block_size") ||
+      changed.count("filestore_max_alloc_hint_size") ||
+      changed.count("filestore_fadvise")) {
+    Mutex::Locker l(lock);  // all eight cached values below are refreshed together under 'lock'
+    m_filestore_min_sync_interval = conf->filestore_min_sync_interval;
+    m_filestore_max_sync_interval = conf->filestore_max_sync_interval;
+    m_filestore_kill_at = conf->filestore_kill_at;
+    m_filestore_fail_eio = conf->filestore_fail_eio;
+    m_filestore_fadvise = conf->filestore_fadvise;
+    m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
+    m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
+    m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size;
+  }
+  if (changed.count("filestore_commit_timeout")) {
+    Mutex::Locker l(sync_entry_timeo_lock);  // this value is guarded by its own mutex rather than 'lock'
+    m_filestore_commit_timeout = conf->filestore_commit_timeout;
+  }
+  if (changed.count("filestore_dump_file")) {
+    if (conf->filestore_dump_file.length() &&
+	conf->filestore_dump_file != "-") {  // a non-empty name other than "-" (re)starts dumping
+      dump_start(conf->filestore_dump_file);
+    } else {  // empty string or "-" turns dumping off
+      dump_stop();
+    }
+  }
+}
+
+int FileStore::set_throttle_params()
+{
+  // Push the queue-throttle config into both throttles; 0 or -EINVAL.
+  const auto& conf = cct->_conf;
+  stringstream ss;
+
+  // A non-zero generic delay multiple wins over the _bytes/_ops variant.
+  const auto high_mult_bytes = conf->filestore_queue_high_delay_multiple ?
+    conf->filestore_queue_high_delay_multiple :
+    conf->filestore_queue_high_delay_multiple_bytes;
+  const auto max_mult_bytes = conf->filestore_queue_max_delay_multiple ?
+    conf->filestore_queue_max_delay_multiple :
+    conf->filestore_queue_max_delay_multiple_bytes;
+  const auto high_mult_ops = conf->filestore_queue_high_delay_multiple ?
+    conf->filestore_queue_high_delay_multiple :
+    conf->filestore_queue_high_delay_multiple_ops;
+  const auto max_mult_ops = conf->filestore_queue_max_delay_multiple ?
+    conf->filestore_queue_max_delay_multiple :
+    conf->filestore_queue_max_delay_multiple_ops;
+
+  const bool bytes_ok = throttle_bytes.set_params(
+    conf->filestore_queue_low_threshhold,
+    conf->filestore_queue_high_threshhold,
+    conf->filestore_expected_throughput_bytes,
+    high_mult_bytes, max_mult_bytes,
+    conf->filestore_queue_max_bytes, &ss);
+  const bool ops_ok = throttle_ops.set_params(
+    conf->filestore_queue_low_threshhold,
+    conf->filestore_queue_high_threshhold,
+    conf->filestore_expected_throughput_ops,
+    high_mult_ops, max_mult_ops,
+    conf->filestore_queue_max_ops, &ss);
+  const bool valid = bytes_ok && ops_ok;
+  logger->set(l_filestore_op_queue_max_ops, throttle_ops.get_max());
+  logger->set(l_filestore_op_queue_max_bytes, throttle_bytes.get_max());
+  if (!valid)
+    derr << "tried to set invalid params: " << ss.str() << dendl;
+  return valid ? 0 : -EINVAL;
+}
+
+void FileStore::dump_start(const std::string& file)
+{
+  dout(10) << __FUNC__ << ": " << file << dendl;
+  // A dump that is already running is finished before the new one begins.
+  if (m_filestore_do_dump)
+    dump_stop();
+  m_filestore_dump_fmt.reset();
+  m_filestore_dump_fmt.open_array_section("dump");
+  m_filestore_dump.open(file.c_str());
+  m_filestore_do_dump = true;
+}
+
+void FileStore::dump_stop()
+{
+  dout(10) << __FUNC__ << dendl;
+  m_filestore_do_dump = false;
+  if (!m_filestore_dump.is_open())
+    return;  // no output file, so there is nothing to finalize
+  m_filestore_dump_fmt.close_section();
+  m_filestore_dump_fmt.flush(m_filestore_dump);
+  m_filestore_dump.flush();
+  m_filestore_dump.close();
+}
+
+void FileStore::dump_transactions(vector<ObjectStore::Transaction>& ls, uint64_t seq, OpSequencer *osr)
+{
+  m_filestore_dump_fmt.open_array_section("transactions");  // one "transaction" object per entry of ls
+  unsigned idx = 0;
+  for (auto& txn : ls) {
+    m_filestore_dump_fmt.open_object_section("transaction");
+    m_filestore_dump_fmt.dump_stream("osr") << osr->cid;
+    m_filestore_dump_fmt.dump_unsigned("seq", seq);
+    m_filestore_dump_fmt.dump_unsigned("trans_num", idx++);
+    txn.dump(&m_filestore_dump_fmt);
+    m_filestore_dump_fmt.close_section();
+  }
+  m_filestore_dump_fmt.close_section();
+  // written out immediately so the file is current after every call
+  m_filestore_dump_fmt.flush(m_filestore_dump);
+  m_filestore_dump.flush();
+}
+
+void FileStore::get_db_statistics(Formatter* f)
+{  // Writes the statistics of the object map's backing db into f.
+  object_map->db->get_statistics(f);  // NOTE(review): object_map and db are dereferenced unchecked; presumably only called while mounted
+}
+
+void FileStore::set_xattr_limits_via_conf()
+{
+  // Recompute the three cached xattr limits (inline size, inline count,
+  // max value size) from the config and the detected filesystem type.
+  const auto& conf = cct->_conf;
+
+  // Filesystem-specific defaults; every switch arm assigns all three.
+  uint32_t dflt_inline_size;
+  uint32_t dflt_inline_count;
+  uint32_t dflt_value_size;
+
+  switch (m_fs_type) {
+#if defined(__linux__)
+  case XFS_SUPER_MAGIC:
+    dflt_inline_size = conf->filestore_max_inline_xattr_size_xfs;
+    dflt_inline_count = conf->filestore_max_inline_xattrs_xfs;
+    dflt_value_size = conf->filestore_max_xattr_value_size_xfs;
+    break;
+  case BTRFS_SUPER_MAGIC:
+    dflt_inline_size = conf->filestore_max_inline_xattr_size_btrfs;
+    dflt_inline_count = conf->filestore_max_inline_xattrs_btrfs;
+    dflt_value_size = conf->filestore_max_xattr_value_size_btrfs;
+    break;
+#endif
+  default:
+    dflt_inline_size = conf->filestore_max_inline_xattr_size_other;
+    dflt_inline_count = conf->filestore_max_inline_xattrs_other;
+    dflt_value_size = conf->filestore_max_xattr_value_size_other;
+    break;
+  }
+
+  // A non-zero generic option overrides the filesystem-specific default.
+  m_filestore_max_inline_xattr_size = conf->filestore_max_inline_xattr_size ?
+    conf->filestore_max_inline_xattr_size : dflt_inline_size;
+
+  m_filestore_max_inline_xattrs = conf->filestore_max_inline_xattrs ?
+    conf->filestore_max_inline_xattrs : dflt_inline_count;
+
+  m_filestore_max_xattr_value_size = conf->filestore_max_xattr_value_size ?
+    conf->filestore_max_xattr_value_size : dflt_value_size;
+
+  // Warn only; the limits computed above stay in effect either way.
+  const bool too_small =
+    m_filestore_max_xattr_value_size < conf->osd_max_object_name_len;
+  if (too_small) {
+    derr << "WARNING: max attr value size ("
+	 << m_filestore_max_xattr_value_size
+	 << ") is smaller than osd_max_object_name_len ("
+	 << conf->osd_max_object_name_len
+	 << ").  Your backend filesystem appears to not support attrs large "
+	 << "enough to handle the configured max rados name size.  You may get "
+	 << "unexpected ENAMETOOLONG errors on rados operations or buggy "
+	 << "behavior"
+	 << dendl;
+  }
+}
+
+uint64_t FileStore::estimate_objects_overhead(uint64_t num_objects)
+{
+  // Assumes each object wastes, on average, half a filesystem block to allocation granularity.
+  return num_objects * blk_size / 2;
+}
+
+int FileStore::apply_layout_settings(const coll_t &cid, int target_level)
+{
+  dout(20) << __FUNC__ << ": " << cid << " target level: "
+	   << target_level << dendl;
+  Index index;
+  const int r = get_index(cid, &index);
+  if (r >= 0) {
+    // index resolved: the re-layout itself is delegated to it
+    return index->apply_layout_settings(target_level);
+  }
+  dout(10) << "Error getting index for " << cid << ": " << cpp_strerror(r)
+	   << dendl;
+  return r;
+}
+
+
+// -- FSSuperblock --
+
+void FSSuperblock::encode(bufferlist &bl) const
+{  // Appends this superblock to bl: compat_features first, then omap_backend.
+  ENCODE_START(2, 1, bl);  // arguments mirror decode(): struct version 2, compat version 1
+  compat_features.encode(bl);
+  encode(omap_backend, bl);  // the field decode() reads only when struct_v >= 2
+  ENCODE_FINISH(bl);
+}
+
+void FSSuperblock::decode(bufferlist::const_iterator &bl)
+{
+  DECODE_START(2, bl);
+  compat_features.decode(bl);
+  if (struct_v < 2)
+    omap_backend = "leveldb";  // pre-v2 encodings are not read for this field; default applied
+  else
+    decode(omap_backend, bl);
+  DECODE_FINISH(bl);
+}
+
+void FSSuperblock::dump(Formatter *f) const
+{  // Emits the superblock into f for inspection.
+  f->open_object_section("compat");
+  compat_features.dump(f);
+  f->dump_string("omap_backend", omap_backend);  // written inside the "compat" section, before it is closed
+  f->close_section();
+}
+
+void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o)
+{
+  FSSuperblock sb;  // instance 1: default-constructed
+  o.push_back(new FSSuperblock(sb));
+  CompatSet::FeatureSet compat;
+  CompatSet::FeatureSet ro_compat;
+  CompatSet::FeatureSet incompat;
+  incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+  sb.compat_features = CompatSet(compat, ro_compat, incompat);
+  // instance 2: the SHARDS incompat feature is set
+  o.push_back(new FSSuperblock(sb));
+  sb.omap_backend = "rocksdb";  // instance 3: additionally names an omap backend
+  o.push_back(new FSSuperblock(sb));
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore.osr(" << this << ") "
+
+void FileStore::OpSequencer::_register_apply(Op *o)
+{
+  // Index every object touched by o's transactions in 'applying', once per Op.
+  if (o->registered_apply) {
+    dout(20) << __func__ << " " << o << " already registered" << dendl;
+    return;
+  }
+  o->registered_apply = true;
+  for (auto& txn : o->tls) {
+    for (auto& entry : txn.get_object_index()) {
+      const ghobject_t *obj = &entry.first;
+      applying.emplace(make_pair(obj->hobj.get_hash(), obj));
+      dout(20) << __func__ << " " << o << " " << *obj << " (" << obj << ")" << dendl;
+    }
+  }
+}
+
+void FileStore::OpSequencer::_unregister_apply(Op *o)
+{
+  // Drop every (hash, ghobject_t*) entry that _register_apply() added for o.
+  ceph_assert(o->registered_apply);
+  for (auto& t : o->tls) {
+    for (auto& i : t.get_object_index()) {
+      uint32_t key = i.first.hobj.get_hash();
+      // equal_range() spans all entries for key; find() may return any one of them
+      auto range = applying.equal_range(key);
+      bool removed = false;
+      for (auto p = range.first; p != range.second; ++p) {
+	if (p->second == &i.first) {
+	  dout(20) << __func__ << " " << o << " " << i.first << " ("
+		   << &i.first << ")" << dendl;
+	  applying.erase(p);
+	  removed = true;
+	  break;
+	}
+      }
+      ceph_assert(removed);
+    }
+  }
+}
+
+void FileStore::OpSequencer::wait_for_apply(const ghobject_t& oid)
+{
+  // Block until no entry in 'applying' refers to an object equal to oid.
+  Mutex::Locker l(qlock);
+  uint32_t key = oid.hobj.get_hash();
+  bool blocked = true;
+  while (blocked) {
+    blocked = false;
+    // equal_range() spans the whole hash slot; find() may return any one match
+    auto range = applying.equal_range(key);
+    for (auto p = range.first; p != range.second; ++p) {
+      if (*p->second == oid) {
+	dout(20) << __func__ << " " << oid << " waiting on " << p->second
+		 << dendl;
+	cond.Wait(qlock);
+	blocked = true;  // rescan from scratch after waking
+	break;
+      }
+    }
+  }
+  dout(20) << __func__ << " " << oid << " done" << dendl;
+}
diff --git a/src/os/filestore/FileStore.h b/src/os/filestore/FileStore.h
new file mode 100644
index 00000000..e09b9e04
--- /dev/null
+++ b/src/os/filestore/FileStore.h
@@ -0,0 +1,938 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILESTORE_H
+#define CEPH_FILESTORE_H
+
+#include "include/types.h"
+
+#include <map>
+#include <deque>
+#include <atomic>
+#include <fstream>
+
+
+#include <boost/scoped_ptr.hpp>
+
+#include "include/unordered_map.h"
+
+#include "include/ceph_assert.h"
+
+#include "os/ObjectStore.h"
+#include "JournalingObjectStore.h"
+
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "common/perf_counters.h"
+#include "common/zipkin_trace.h"
+
+#include "common/Mutex.h"
+#include "HashIndex.h"
+#include "IndexManager.h"
+#include "os/ObjectMap.h"
+#include "SequencerPosition.h"
+#include "FDCache.h"
+#include "WBThrottle.h"
+
+#include "include/uuid.h"
+
+#if defined(__linux__)
+# ifndef BTRFS_SUPER_MAGIC
+#define BTRFS_SUPER_MAGIC 0x9123683EUL
+# endif
+# ifndef XFS_SUPER_MAGIC
+#define XFS_SUPER_MAGIC 0x58465342UL
+# endif
+# ifndef ZFS_SUPER_MAGIC
+#define ZFS_SUPER_MAGIC 0x2fc12fc1UL
+# endif
+#endif
+
+
+class FileStoreBackend;
+
+#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
+
+enum {  // counter ids used with the PerfCounters 'logger', e.g. logger->set(l_filestore_op_queue_max_ops, ...)
+  l_filestore_first = 84000,  // lower bound marker; the ids below follow sequentially
+  l_filestore_journal_queue_ops,
+  l_filestore_journal_queue_bytes,
+  l_filestore_journal_ops,
+  l_filestore_journal_bytes,
+  l_filestore_journal_latency,
+  l_filestore_journal_wr,
+  l_filestore_journal_wr_bytes,
+  l_filestore_journal_full,
+  l_filestore_committing,
+  l_filestore_commitcycle,
+  l_filestore_commitcycle_interval,
+  l_filestore_commitcycle_latency,
+  l_filestore_op_queue_max_ops,
+  l_filestore_op_queue_ops,
+  l_filestore_ops,
+  l_filestore_op_queue_max_bytes,
+  l_filestore_op_queue_bytes,
+  l_filestore_bytes,
+  l_filestore_apply_latency,
+  l_filestore_queue_transaction_latency_avg,
+  l_filestore_sync_pause_max_lat,
+  l_filestore_last,  // upper bound marker; keep as the final enumerator
+};
+
+class FSSuperblock {  // Encodable record holding the store's feature set and omap backend name.
+public:
+  CompatSet compat_features;  // compat / ro_compat / incompat feature sets
+  string omap_backend;  // backend name; decode() falls back to "leveldb" for encodings older than v2
+
+  FSSuperblock() { }
+
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator &bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<FSSuperblock*>& o);  // appends heap-allocated samples to o
+};
+WRITE_CLASS_ENCODER(FSSuperblock)
+
+inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
+{  // rendered as: sb(<compat features>): <omap backend>
+  out << "sb(" << sb.compat_features << "): " << sb.omap_backend;
+  return out;
+}
+
+class FileStore : public JournalingObjectStore,
+                  public md_config_obs_t
+{
+  static const uint32_t target_version = 4;
+public:
+  uint32_t get_target_version() {
+    return target_version;  // the static class constant declared just above (currently 4)
+  }
+
+  static int get_block_device_fsid(CephContext* cct, const string& path,
+				   uuid_d *fsid);
+  struct FSPerfTracker {  // Keeps running averages of commit and apply latency.
+    PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
+    PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
+
+    objectstore_perf_stat_t get_cur_stats() const {  // snapshot of the two current averages
+      objectstore_perf_stat_t ret;
+      ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
+      ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
+      return ret;
+    }
+
+    void update_from_perfcounters(PerfCounters &logger);  // defined out of line
+  } perf_tracker;
+  objectstore_perf_stat_t get_cur_stats() override {
+    perf_tracker.update_from_perfcounters(*logger);  // refresh the tracker from 'logger' before reading it
+    return perf_tracker.get_cur_stats();
+  }
+  const PerfCounters* get_perf_counters() const override {
+    return logger;  // non-owning, read-only view of this store's counters
+  }
+
+private:
+  string internal_name;         ///< internal name, used to name the perfcounter instance
+  string basedir, journalpath;
+  osflagbits_t generic_flags;
+  std::string current_fn;
+  std::string current_op_seq_fn;
+  std::string omap_dir;
+  uuid_d fsid;
+
+  size_t blk_size;            ///< fs block size
+
+  int fsid_fd, op_fd, basedir_fd, current_fd;
+
+  FileStoreBackend *backend;
+
+  void create_backend(unsigned long f_type);
+
+  string devname;
+
+  int vdo_fd = -1;
+  string vdo_name;
+
+  deque<uint64_t> snaps;
+
+  // Indexed Collections
+  IndexManager index_manager;
+  int get_index(const coll_t& c, Index *index);
+  int init_index(const coll_t& c);
+
+  bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) {
+    // temp objects live in pools <= -1: the normal temp case is < -1, the hammer case is -1
+    if (!cid.is_pg()) return false;
+    return oid.hobj.pool <= -1;
+  }
+  void init_temp_collections();
+
+  void handle_eio();
+
+  // ObjectMap
+  boost::scoped_ptr<ObjectMap> object_map;
+
+  // helper fns
+  int get_cdir(const coll_t& cid, char *s, int len);
+
+  /// read a uuid from fd
+  int read_fsid(int fd, uuid_d *uuid);
+
+  /// lock fsid_fd
+  int lock_fsid();
+
+  // sync thread
+  Mutex lock;
+  bool force_sync;
+  Cond sync_cond;
+
+  Mutex sync_entry_timeo_lock;
+  SafeTimer timer;
+
+  list<Context*> sync_waiters;
+  bool stop;
+  void sync_entry();
+  struct SyncThread : public Thread {  // Thread whose body is FileStore::sync_entry().
+    FileStore *fs;  // non-owning back pointer to the store
+    explicit SyncThread(FileStore *f) : fs(f) {}
+    void *entry() override {
+      fs->sync_entry();
+      return 0;  // the thread result is always a null pointer
+    }
+  } sync_thread;
+
+  // -- op workqueue --
+  struct Op {  // One queued unit of work: a batch of transactions plus its completion contexts.
+    utime_t start;
+    uint64_t op;  // sequence number; this is the value OpSequencer stores in jq and compares in q
+    vector<Transaction> tls;  // the transactions that make up this op
+    Context *onreadable, *onreadable_sync;  // NOTE(review): no default initializers; presumably set by build_op()
+    uint64_t ops, bytes;
+    TrackedOpRef osd_op;
+    ZTracer::Trace trace;
+    bool registered_apply = false;  // set by OpSequencer::_register_apply() so registration happens once
+  };
+  class OpSequencer : public CollectionImpl {
+    CephContext *cct;
+    Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
+    list<Op*> q;
+    list<uint64_t> jq;
+    list<pair<uint64_t, Context*> > flush_commit_waiters;
+    Cond cond;
+    string osr_name_str;
+    /// hash of pointers to ghobject_t's for in-flight writes
+    unordered_multimap<uint32_t,const ghobject_t*> applying;
+  public:
+    Mutex apply_lock;  // for apply mutual exclusion
+    int id;
+    const char *osr_name;
+
+    /// get_max_uncompleted: newest seq pending in q or jq
+    bool _get_max_uncompleted(
+      uint64_t *seq ///< [out] max uncompleted seq
+      ) {
+      ceph_assert(qlock.is_locked());
+      ceph_assert(seq);
+      *seq = 0;
+      const bool idle = q.empty() && jq.empty();
+      if (!idle) {
+	// take the larger of the queue tails that exist
+	if (!q.empty())
+	  *seq = q.back()->op;
+	if (!jq.empty() && *seq < jq.back())
+	  *seq = jq.back();
+      }
+      return idle;
+    } /// @returns true if both queues are empty
+
+    /// get_min_uncompleted: smallest seq still pending in either q or jq
+    bool _get_min_uncompleted(
+      uint64_t *seq ///< [out] min uncompleted seq (0 when both queues are empty)
+      ) {
+      ceph_assert(qlock.is_locked());
+      ceph_assert(seq);
+      *seq = 0;
+      if (q.empty() && jq.empty())
+	return true;
+      if (!q.empty())
+	*seq = q.front()->op;
+      // with q empty, *seq is still the 0 placeholder, so jq.front() must win
+      if (!jq.empty() && (q.empty() || jq.front() < *seq))
+	*seq = jq.front();
+
+      return false;
+    } /// @returns true if both queues are empty
+
+    void _wake_flush_waiters(list<Context*> *to_queue) {
+      // Hand over, in order, each waiter whose seq is below the value reported
+      // by _get_min_uncompleted(); with both queues empty the bound is all-ones.
+      uint64_t bound;
+      if (_get_min_uncompleted(&bound))
+	bound = -1;
+      while (!flush_commit_waiters.empty() &&
+	     flush_commit_waiters.front().first < bound) {
+	to_queue->push_back(flush_commit_waiters.front().second);
+	flush_commit_waiters.pop_front();
+      }
+    }
+
+    void queue_journal(Op *o) {  // records o as pending in the journal queue
+      Mutex::Locker l(qlock);
+      jq.push_back(o->op);  // only the sequence number is kept in jq
+      _register_apply(o);  // a no-op if o was already registered
+    }
+    void dequeue_journal(list<Context*> *to_queue) {  // retires the oldest journal-queue entry
+      Mutex::Locker l(qlock);
+      jq.pop_front();  // NOTE(review): no emptiness check; presumably always paired with queue_journal()
+      cond.Signal();  // wakes threads blocked on cond in flush() / wait_for_apply()
+      _wake_flush_waiters(to_queue);  // flush_commit contexts that became ready are appended to *to_queue
+    }
+    void queue(Op *o) {  // appends o to the op queue
+      Mutex::Locker l(qlock);
+      q.push_back(o);
+      _register_apply(o);  // a no-op if queue_journal() already registered o
+      o->trace.keyval("queue depth", q.size());  // depth includes o itself
+    }
+    void _register_apply(Op *o);
+    void _unregister_apply(Op *o);
+    void wait_for_apply(const ghobject_t& oid);
+    Op *peek_queue() {  // returns the oldest queued op without removing it
+      Mutex::Locker l(qlock);
+      ceph_assert(apply_lock.is_locked());  // caller must already hold apply_lock
+      return q.front();  // NOTE(review): q is not checked for emptiness; presumably guaranteed by the caller
+    }
+
+    Op *dequeue(list<Context*> *to_queue) {  // removes and returns the oldest queued op
+      ceph_assert(to_queue);
+      ceph_assert(apply_lock.is_locked());  // caller must already hold apply_lock
+      Mutex::Locker l(qlock);
+      Op *o = q.front();
+      q.pop_front();
+      cond.Signal();  // wakes threads blocked on cond in flush() / wait_for_apply()
+      _unregister_apply(o);  // asserts that o had been registered
+      _wake_flush_waiters(to_queue);  // flush_commit contexts that became ready are appended to *to_queue
+      return o;  // the Op is not deleted here
+    }
+
+    void flush() override {
+      Mutex::Locker l(qlock);
+
+      while (cct->_conf->filestore_blackhole)
+	cond.Wait(qlock);  // wait forever
+
+      // watermark: newest seq currently in the journal _or_ op queue
+      uint64_t watermark = 0;
+      _get_max_uncompleted(&watermark);
+      if (watermark == 0)
+	return;
+
+      // block until everything up to the watermark has left both queues
+      for (;;) {
+	const bool q_pending = !q.empty() && q.front()->op <= watermark;
+	const bool jq_pending = !jq.empty() && jq.front() <= watermark;
+	if (!q_pending && !jq_pending)
+	  break;
+	cond.Wait(qlock);
+      }
+    }
+    bool flush_commit(Context *c) override {
+      Mutex::Locker l(qlock);
+      uint64_t last_pending = 0;
+      const bool idle = _get_max_uncompleted(&last_pending);
+      if (!idle) {
+	// c is parked; _wake_flush_waiters() hands it over later
+	flush_commit_waiters.push_back(make_pair(last_pending, c));
+      }
+      return idle;
+    }
+
+    OpSequencer(CephContext* cct, int i, coll_t cid)  // i becomes 'id'; cid names the collection
+      : CollectionImpl(cid),
+	cct(cct),
+	qlock("FileStore::OpSequencer::qlock", false, false),
+	osr_name_str(stringify(cid)),
+	apply_lock("FileStore::OpSequencer::apply_lock", false, false),
+        id(i),
+	osr_name(osr_name_str.c_str()) {}  // points into osr_name_str, which is declared (so initialized) earlier
+    ~OpSequencer() override {
+      ceph_assert(q.empty());  // destroying a sequencer that still has queued ops is a bug
+    }
+  };
+  typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
+
+  Mutex coll_lock;
+  map<coll_t,OpSequencerRef> coll_map;
+
+  friend ostream& operator<<(ostream& out, const OpSequencer& s);
+
+  FDCache fdcache;
+  WBThrottle wbthrottle;
+
+  std::atomic<int64_t> next_osr_id = { 0 };
+  bool m_disable_wbthrottle;
+  deque<OpSequencer*> op_queue;
+  BackoffThrottle throttle_ops, throttle_bytes;
+  const int m_ondisk_finisher_num;
+  const int m_apply_finisher_num;
+  vector<Finisher*> ondisk_finishers;
+  vector<Finisher*> apply_finishers;
+
+  ThreadPool op_tp;
+  struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {  // Work queue adapter over FileStore::op_queue.
+    FileStore *store;  // non-owning back pointer; all state lives in the store
+    OpWQ(FileStore *fs, time_t timeout, time_t suicide_timeout, ThreadPool *tp)
+      : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ", timeout, suicide_timeout, tp), store(fs) {}
+
+    bool _enqueue(OpSequencer *osr) override {  // appends osr; always reports success
+      store->op_queue.push_back(osr);
+      return true;
+    }
+    void _dequeue(OpSequencer *o) override {  // removing a specific item is unsupported
+      ceph_abort();
+    }
+    bool _empty() override {
+      return store->op_queue.empty();
+    }
+    OpSequencer *_dequeue() override {  // pops the front sequencer, or nullptr when idle
+      if (store->op_queue.empty())
+	return nullptr;
+      OpSequencer *osr = store->op_queue.front();
+      store->op_queue.pop_front();
+      return osr;
+    }
+    void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override {  // forwards to FileStore::_do_op
+      store->_do_op(osr, handle);
+    }
+    void _process_finish(OpSequencer *osr) override {  // forwards to FileStore::_finish_op
+      store->_finish_op(osr);
+    }
+    void _clear() override {  // nothing is discarded; the queue must already be drained
+      ceph_assert(store->op_queue.empty());
+    }
+  } op_wq;
+
+  void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
+  void _finish_op(OpSequencer *o);
+  Op *build_op(vector<Transaction>& tls,
+	       Context *onreadable, Context *onreadable_sync,
+	       TrackedOpRef osd_op);
+  void queue_op(OpSequencer *osr, Op *o);
+  void op_queue_reserve_throttle(Op *o);
+  void op_queue_release_throttle(Op *o);
+  void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
+  friend struct C_JournaledAhead;
+
+  void new_journal();
+
+  PerfCounters *logger;
+
+  ZTracer::Endpoint trace_endpoint;
+
+public:
+  int lfn_find(const ghobject_t& oid, const Index& index,
+                                  IndexedPath *path = nullptr);
+  int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length);
+  int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf);
+  int lfn_open(
+    const coll_t& cid,
+    const ghobject_t& oid,
+    bool create,
+    FDRef *outfd,
+    Index *index = nullptr);
+
+  void lfn_close(FDRef fd);
+  int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) ;
+  int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos,
+		 bool force_clear_omap=false);
+
+public:
+  FileStore(CephContext* cct, const std::string &base, const std::string &jdev,
+	    osflagbits_t flags = 0,
+    const char *internal_name = "filestore", bool update_to=false);
+  ~FileStore() override;
+
+  string get_type() override {
+    return "filestore";  // constant type name of this implementation
+  }
+
+  int _detect_fs();
+  int _sanity_check_fs();
+
+  bool test_mount_in_use() override;
+  int read_op_seq(uint64_t *seq);
+  int write_op_seq(int, uint64_t seq);
+  int mount() override;
+  int umount() override;
+
+  int validate_hobject_key(const hobject_t &obj) const override;
+
+  unsigned get_max_attr_name_length() override {
+    // The xattr name limit is 128; leave room for our prefixes (user.ceph._)
+    // plus some margin, and cap the usable length at 100.
+    return 100;
+  }
+  int mkfs() override;
+  int mkjournal() override;
+  bool wants_journal() override {
+    return true;  // unconditional
+  }
+  bool allows_journal() override {
+    return true;  // unconditional
+  }
+  bool needs_journal() override {
+    return false;  // a journal is wanted and allowed (see above) but not mandatory
+  }
+
+  bool is_sync_onreadable() const override {
+    return false;  // unconditional
+  }
+
+  bool is_rotational() override;
+  bool is_journal_rotational() override;
+
+  void dump_perf_counters(Formatter *f) override {  // writes this store's counters under a "perf_counters" section
+    f->open_object_section("perf_counters");
+    logger->dump_formatted(f, false);  // NOTE(review): second argument presumably disables schema output; confirm against PerfCounters
+    f->close_section();
+  }
+
+  int flush_cache(ostream *os = NULL) override;
+  int write_version_stamp();
+  int version_stamp_is_valid(uint32_t *version);
+  int update_version_stamp();
+  int upgrade() override;
+
+  bool can_sort_nibblewise() override {
+    return true;    // the legacy (nibblewise) sort order is supported
+  }
+
+  void collect_metadata(map<string,string> *pm) override;
+  int get_devices(set<string> *ls) override;
+
+  int statfs(struct store_statfs_t *buf,
+             osd_alert_list_t* alerts = nullptr) override;
+  int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override;
+
+  int _do_transactions(
+    vector<Transaction> &tls, uint64_t op_seq,
+    ThreadPool::TPHandle *handle,
+    const char *osr_name);
+  int do_transactions(vector<Transaction> &tls, uint64_t op_seq) override {
+    return _do_transactions(tls, op_seq, nullptr, "replay");  // no thread-pool handle; "replay" is passed as the osr name
+  }
+  void _do_transaction(
+    Transaction& t, uint64_t op_seq, int trans_num,
+    ThreadPool::TPHandle *handle, const char *osr_name);
+
+  CollectionHandle open_collection(const coll_t& c) override;
+  CollectionHandle create_new_collection(const coll_t& c) override;
+  void set_collection_commit_queue(const coll_t& cid,
+				   ContextQueue *commit_queue) override {  // intentionally empty: both arguments are ignored
+  }
+
+  int queue_transactions(CollectionHandle& ch, vector<Transaction>& tls,
+			 TrackedOpRef op = TrackedOpRef(),
+			 ThreadPool::TPHandle *handle = nullptr) override;
+
+  /**
+   * set replay guard xattr on given file
+   *
+   * This will ensure that we will not replay this (or any previous) operation
+   * against this particular inode/object.
+   *
+   * @param fd open file descriptor for the file/object
+   * @param spos sequencer position of the last operation we should not replay
+   */
+  void _set_replay_guard(int fd,
+			 const SequencerPosition& spos,
+			 const ghobject_t *oid=0,
+			 bool in_progress=false);
+  void _set_replay_guard(const coll_t& cid,
+                         const SequencerPosition& spos,
+                         bool in_progress);
+  void _set_global_replay_guard(const coll_t& cid,
+				const SequencerPosition &spos);
+
+  /// close a replay guard opened with in_progress=true
+  void _close_replay_guard(int fd, const SequencerPosition& spos,
+			   const ghobject_t *oid=0);
+  void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+
+  /**
+   * check replay guard xattr on given file
+   *
+   * Check the current position against any marker on the file that
+   * indicates which operations have already been applied.  If the
+   * current or a newer operation has been marked as applied, we
+   * should not replay the current operation again.
+   *
+   * If we are not replaying the journal, we always report that the operation
+   * can be applied.  It is only on replay that we might report otherwise,
+   * indicating that the operation should not be performed (again).
+   *
+   * @param fd open fd on the file/object in question
+   * @param spos sequencerposition for an operation we could apply/replay
+   * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
+   */
+  int _check_replay_guard(int fd, const SequencerPosition& spos);
+  int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+  int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos);
+  int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+
+  // ------------------
+  // objects
+  int pick_object_revision_lt(ghobject_t& oid) {
+    return 0;  // stub: oid is left untouched
+  }
+  using ObjectStore::exists;
+  bool exists(CollectionHandle& c, const ghobject_t& oid) override;
+  using ObjectStore::stat;
+  int stat(
+    CollectionHandle& c,
+    const ghobject_t& oid,
+    struct stat *st,
+    bool allow_eio = false) override;
+  using ObjectStore::set_collection_opts;
+  int set_collection_opts(
+    CollectionHandle& c,
+    const pool_opts_t& opts) override;
+  using ObjectStore::read;
+  int read(
+    CollectionHandle& c,
+    const ghobject_t& oid,
+    uint64_t offset,
+    size_t len,
+    bufferlist& bl,
+    uint32_t op_flags = 0) override;
+  int _do_fiemap(int fd, uint64_t offset, size_t len,
+                 map<uint64_t, uint64_t> *m);
+  int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
+                         map<uint64_t, uint64_t> *m);
+  using ObjectStore::fiemap;
+  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) override;
+  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
+
+  int _touch(const coll_t& cid, const ghobject_t& oid);
+  int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
+	      const bufferlist& bl, uint32_t fadvise_flags = 0);
+  int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
+  int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+  int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+	     const SequencerPosition& spos);
+  int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
+		   uint64_t srcoff, uint64_t len, uint64_t dstoff,
+		   const SequencerPosition& spos);
+  int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+  int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+  int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
+  int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos);
+
+  int _fgetattr(int fd, const char *name, bufferptr& bp);
+  int _fgetattrs(int fd, map<string,bufferptr>& aset);
+  int _fsetattrs(int fd, map<string, bufferptr> &aset);
+
+  void do_force_sync();
+  void start_sync(Context *onsafe);
+  void sync();
+  void _flush_op_queue();
+  void flush();
+  void sync_and_flush();
+
+  int flush_journal() override;
+  int dump_journal(ostream& out) override;
+
+  /// Replace the in-memory fsid member with @p u (nothing else is done here).
+  void set_fsid(uuid_d u) override {
+    fsid = u;
+  }
+  /// Return a copy of the in-memory fsid member.
+  uuid_d get_fsid() override { return fsid; }
+  
+  uint64_t estimate_objects_overhead(uint64_t num_objects) override;
+
+  // DEBUG read error injection, an object is removed from both on delete()
+  Mutex read_error_lock;
+  set<ghobject_t> data_error_set; // read() will return -EIO
+  set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
+  void inject_data_error(const ghobject_t &oid) override;
+  void inject_mdata_error(const ghobject_t &oid) override;
+
+  /// Forward a compaction request to object_map; asserts object_map is set.
+  void compact() override {
+    ceph_assert(object_map);
+    object_map->compact();
+  }
+
+  /// Always reports false: this store does not advertise built-in checksums.
+  bool has_builtin_csum() const override {
+    return false;
+  }
+
+  void debug_obj_on_delete(const ghobject_t &oid);
+  bool debug_data_eio(const ghobject_t &oid);
+  bool debug_mdata_eio(const ghobject_t &oid);
+
+  int snapshot(const string& name) override;
+
+  // attrs
+  using ObjectStore::getattr;
+  using ObjectStore::getattrs;
+  int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, bufferptr &bp) override;
+  int getattrs(CollectionHandle& c, const ghobject_t& oid, map<string,bufferptr>& aset) override;
+
+  int _setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset,
+		const SequencerPosition &spos);
+  int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
+	      const SequencerPosition &spos);
+  int _rmattrs(const coll_t& cid, const ghobject_t& oid,
+	       const SequencerPosition &spos);
+
+  int _collection_remove_recursive(const coll_t &cid,
+				   const SequencerPosition &spos);
+
+  int _collection_set_bits(const coll_t& cid, int bits);
+
+  // collections
+  using ObjectStore::collection_list;
+  int collection_bits(CollectionHandle& c) override;
+  /// Handle-based listing: flush the handle, then delegate to the overload
+  /// that takes the collection id.
+  int collection_list(CollectionHandle& c,
+		      const ghobject_t& start, const ghobject_t& end, int max,
+		      vector<ghobject_t> *ls, ghobject_t *next) override {
+    c->flush();
+    const auto& cid = c->cid;
+    return collection_list(cid, start, end, max, ls, next);
+  }
+  int collection_list(const coll_t& cid,
+		      const ghobject_t& start, const ghobject_t& end, int max,
+		      vector<ghobject_t> *ls, ghobject_t *next);
+  int list_collections(vector<coll_t>& ls) override;
+  int list_collections(vector<coll_t>& ls, bool include_temp);
+  int collection_stat(const coll_t& c, struct stat *st);
+  bool collection_exists(const coll_t& c) override;
+  /// Handle-based emptiness check: flush the handle, then delegate to the
+  /// overload that takes the collection id.
+  int collection_empty(CollectionHandle& c, bool *empty) override {
+    c->flush();
+    const auto& cid = c->cid;
+    return collection_empty(cid, empty);
+  }
+  int collection_empty(const coll_t& cid, bool *empty);
+
+  // omap (see ObjectStore.h for documentation)
+  using ObjectStore::omap_get;
+  int omap_get(CollectionHandle& c, const ghobject_t &oid, bufferlist *header,
+	       map<string, bufferlist> *out) override;
+  using ObjectStore::omap_get_header;
+  int omap_get_header(
+    CollectionHandle& c,
+    const ghobject_t &oid,
+    bufferlist *out,
+    bool allow_eio = false) override;
+  using ObjectStore::omap_get_keys;
+  int omap_get_keys(CollectionHandle& c, const ghobject_t &oid, set<string> *keys) override;
+  using ObjectStore::omap_get_values;
+  int omap_get_values(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys,
+		      map<string, bufferlist> *out) override;
+  using ObjectStore::omap_check_keys;
+  int omap_check_keys(CollectionHandle& c, const ghobject_t &oid, const set<string> &keys,
+		      set<string> *out) override;
+  using ObjectStore::get_omap_iterator;
+  ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c, const ghobject_t &oid) override;
+  ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid, const ghobject_t &oid);
+
+  int _create_collection(const coll_t& c, int bits,
+			 const SequencerPosition &spos);
+  int _destroy_collection(const coll_t& c);
+  /**
+   * Give an expected number of objects hint to the collection.
+   *
+   * @param c                 - collection id.
+   * @param pg_num            - pg number of the pool this collection belongs to
+   * @param expected_num_objs - expected number of objects in this collection
+   * @param spos              - sequence position
+   *
+   * @return 0 on success, an error code otherwise
+   */
+  int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
+      uint64_t expected_num_objs,
+      const SequencerPosition &spos);
+  int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid,
+		      const SequencerPosition& spos);
+  int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+			      coll_t c, const ghobject_t& o,
+			      const SequencerPosition& spos,
+			      bool ignore_enoent = false);
+
+  int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
+                      uint64_t expected_object_size,
+                      uint64_t expected_write_size);
+
+  void dump_start(const std::string& file);
+  void dump_stop();
+  void dump_transactions(vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);
+
+  virtual int apply_layout_settings(const coll_t &cid, int target_level);
+
+  void get_db_statistics(Formatter* f) override;
+
+private:
+  void _inject_failure();
+
+  // omap
+  int _omap_clear(const coll_t& cid, const ghobject_t &oid,
+		  const SequencerPosition &spos);
+  int _omap_setkeys(const coll_t& cid, const ghobject_t &oid,
+		    const map<string, bufferlist> &aset,
+		    const SequencerPosition &spos);
+  int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const set<string> &keys,
+		   const SequencerPosition &spos);
+  int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+		       const string& first, const string& last,
+		       const SequencerPosition &spos);
+  int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const bufferlist &bl,
+		      const SequencerPosition &spos);
+  int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest,
+                        const SequencerPosition &spos);
+  int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest,
+                        const SequencerPosition &spos);
+
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override;
+  int set_throttle_params();
+  float m_filestore_commit_timeout;
+  bool m_filestore_journal_parallel;
+  bool m_filestore_journal_trailing;
+  bool m_filestore_journal_writeahead;
+  int m_filestore_fiemap_threshold;
+  double m_filestore_max_sync_interval;
+  double m_filestore_min_sync_interval;
+  bool m_filestore_fail_eio;
+  bool m_filestore_fadvise;
+  int do_update;
+  bool m_journal_dio, m_journal_aio, m_journal_force_aio;
+  std::string m_osd_rollback_to_cluster_snap;
+  bool m_osd_use_stale_snap;
+  bool m_filestore_do_dump;
+  std::ofstream m_filestore_dump;
+  JSONFormatter m_filestore_dump_fmt;
+  std::atomic<int64_t> m_filestore_kill_at = { 0 };
+  bool m_filestore_sloppy_crc;
+  int m_filestore_sloppy_crc_block_size;
+  uint64_t m_filestore_max_alloc_hint_size;
+  unsigned long m_fs_type;
+
+  //Determined xattr handling based on fs type
+  void set_xattr_limits_via_conf();
+  uint32_t m_filestore_max_inline_xattr_size;
+  uint32_t m_filestore_max_inline_xattrs;
+  uint32_t m_filestore_max_xattr_value_size;
+
+  FSSuperblock superblock;
+
+  /**
+   * write_superblock()
+   *
+   * Write superblock to persistent storage
+   *
+   * return value: 0 on success, otherwise negative errno
+   */
+  int write_superblock();
+
+  /**
+   * read_superblock()
+   *
+   * Fill in FileStore::superblock by reading persistent storage
+   *
+   * return value: 0 on success, otherwise negative errno
+   */
+  int read_superblock();
+
+  friend class FileStoreBackend;
+  friend class TestFileStore;
+};
+
+ostream& operator<<(ostream& out, const FileStore::OpSequencer& s);
+
+struct fiemap;
+
+/**
+ * Abstract base for filesystem-specific FileStore backends.
+ *
+ * Holds a pointer to the owning FileStore and exposes selected FileStore
+ * members to subclasses through the protected accessors below.  Everything
+ * filesystem-dependent (feature detection, checkpoints, fiemap, cloning,
+ * sloppy-crc hooks) is declared pure virtual and supplied by subclasses.
+ */
+class FileStoreBackend {
+private:
+  FileStore *filestore;
+protected:
+  // Accessors that forward to the corresponding FileStore members.
+  int get_basedir_fd() {
+    return filestore->basedir_fd;
+  }
+  int get_current_fd() {
+    return filestore->current_fd;
+  }
+  int get_op_fd() {
+    return filestore->op_fd;
+  }
+  size_t get_blksize() {
+    return filestore->blk_size;
+  }
+  const string& get_basedir_path() {
+    return filestore->basedir;
+  }
+  const string& get_journal_path() {
+    return filestore->journalpath;
+  }
+  const string& get_current_path() {
+    return filestore->current_fn;
+  }
+  /// Copy a byte range between two fds via FileStore: uses the sparse copy
+  /// when the backend reports fiemap or SEEK_DATA/SEEK_HOLE support,
+  /// otherwise the plain copy.
+  int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
+    if (has_fiemap() || has_seek_data_hole()) {
+      return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
+    } else {
+      return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
+    }
+  }
+  /// Block size configured for sloppy crc tracking (FileStore member).
+  int get_crc_block_size() {
+    return filestore->m_filestore_sloppy_crc_block_size;
+  }
+
+public:
+  /// @param fs owning FileStore; stored as a raw pointer and dereferenced by
+  ///           every accessor, so it must be non-null.
+  explicit FileStoreBackend(FileStore *fs) : filestore(fs) {}
+  virtual ~FileStoreBackend() {}
+
+  /// CephContext of the owning FileStore (used by the dout logging macros).
+  CephContext* cct() const {
+    return filestore->cct;
+  }
+
+  /// Factory keyed on a filesystem type value; defined outside this header.
+  static FileStoreBackend *create(unsigned long f_type, FileStore *fs);
+
+  // Backend interface; every member below must be provided by subclasses.
+  virtual const char *get_name() = 0;
+  virtual int detect_features() = 0;
+  virtual int create_current() = 0;
+  virtual bool can_checkpoint() = 0;
+  virtual int list_checkpoints(list<string>& ls) = 0;
+  virtual int create_checkpoint(const string& name, uint64_t *cid) = 0;
+  virtual int sync_checkpoint(uint64_t id) = 0;
+  virtual int rollback_to(const string& name) = 0;
+  virtual int destroy_checkpoint(const string& name) = 0;
+  virtual int syncfs() = 0;
+  virtual bool has_fiemap() = 0;
+  virtual bool has_seek_data_hole() = 0;
+  virtual bool is_rotational() = 0;
+  virtual bool is_journal_rotational() = 0;
+  virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
+  virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
+  virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
+  virtual bool has_splice() const = 0;
+
+  // hooks for (sloppy) crc tracking
+  virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
+  virtual int _crc_update_truncate(int fd, loff_t off) = 0;
+  virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
+  virtual int _crc_update_clone_range(int srcfd, int destfd,
+				      loff_t srcoff, size_t len, loff_t dstoff) = 0;
+  virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+			       ostream *out) = 0;
+};
+
+#endif
diff --git a/src/os/filestore/GenericFileStoreBackend.cc b/src/os/filestore/GenericFileStoreBackend.cc
new file mode 100644
index 00000000..a75d501f
--- /dev/null
+++ b/src/os/filestore/GenericFileStoreBackend.cc
@@ -0,0 +1,468 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "GenericFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+#include "common/blkdev.h"
+
+#include "common/SloppyCRCMap.h"
+#include "os/filestore/chain_xattr.h"
+
+#define SLOPPY_CRC_XATTR "user.cephos.scrc"
+
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
+
+/**
+ * Construct the generic backend.
+ *
+ * Snapshots the fiemap / seek-data-hole / fsync / splice config options and
+ * probes whether the base directory and the journal sit on rotational
+ * devices.  The two probes are independent: a failure to open one path only
+ * leaves that flag at its in-class default and does not skip the other.
+ *
+ * @param fs owning FileStore, forwarded to FileStoreBackend
+ */
+GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
+  FileStoreBackend(fs),
+  ioctl_fiemap(false),
+  seek_data_hole(false),
+  use_splice(false),
+  m_filestore_fiemap(cct()->_conf->filestore_fiemap),
+  m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole),
+  m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data),
+  m_filestore_splice(cct()->_conf->filestore_splice)
+{
+  // rotational?
+  {
+    // NOTE: the below won't work on btrfs; we'll assume rotational.
+    string fn = get_basedir_path();
+    int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
+    if (fd >= 0) {
+      BlkDev blkdev(fd);
+      m_rotational = blkdev.is_rotational();
+      dout(20) << __func__ << " basedir " << fn
+	       << " rotational " << (int)m_rotational << dendl;
+      ::close(fd);
+    } else {
+      // keep the default and still go on to probe the journal
+      int err = errno;
+      dout(20) << __func__ << " unable to open basedir " << fn
+	       << ": " << cpp_strerror(err) << dendl;
+    }
+  }
+  // journal rotational?
+  {
+    // NOTE: the below won't work on btrfs; we'll assume rotational.
+    string fn = get_journal_path();
+    int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
+    if (fd >= 0) {
+      BlkDev blkdev(fd);
+      m_journal_rotational = blkdev.is_rotational();
+      dout(20) << __func__ << " journal filename " << fn.c_str()
+	       << " journal rotational " << (int)m_journal_rotational << dendl;
+      ::close(fd);
+    } else {
+      int err = errno;
+      dout(20) << __func__ << " unable to open journal " << fn
+	       << ": " << cpp_strerror(err) << dendl;
+    }
+  }
+}
+
+/**
+ * Probe the filesystem under the base directory for optional features.
+ *
+ * Creates a scratch file "<basedir>/fiemap_test", writes a fixed sparse
+ * layout into it and then checks, in order: the FIEMAP ioctl, lseek
+ * SEEK_DATA/SEEK_HOLE, splice(2) (when CEPH_HAVE_SPLICE is defined) and
+ * syncfs(2).  Results are stored in ioctl_fiemap, seek_data_hole and
+ * use_splice; syncfs availability is only logged.  The scratch file is
+ * removed on every exit path once it has been created.
+ *
+ * @return 0 on success, negative errno if the scratch file cannot be
+ *         created, positioned or written, or if SEEK_HOLE fails with
+ *         anything other than EINVAL.
+ */
+int GenericFileStoreBackend::detect_features()
+{
+  char fn[PATH_MAX];
+  snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
+
+  int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, 0644);
+  if (fd < 0) {
+    fd = -errno;
+    derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
+    return fd;
+  }
+
+  // ext4 has a bug in older kernels where fiemap will return an empty
+  // result in some cases.  this is a file layout that triggers the bug
+  // on 2.6.34-rc5.
+  // (offset, length) pairs, terminated by a zero offset.
+  int v[] = {
+    0x0000000000016000, 0x0000000000007000,
+    0x000000000004a000, 0x0000000000007000,
+    0x0000000000060000, 0x0000000000001000,
+    0x0000000000061000, 0x0000000000008000,
+    0x0000000000069000, 0x0000000000007000,
+    0x00000000000a3000, 0x000000000000c000,
+    0x000000000024e000, 0x000000000000c000,
+    0x000000000028b000, 0x0000000000009000,
+    0x00000000002b1000, 0x0000000000003000,
+    0, 0
+  };
+  for (int i=0; v[i]; i++) {
+    int off = v[i++];
+    int len = v[i];
+
+    // write a large extent
+    char buf[len];
+    memset(buf, 1, sizeof(buf));
+    int r = ::lseek(fd, off, SEEK_SET);
+    if (r < 0) {
+      r = -errno;
+      derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+      ::unlink(fn);
+      return r;
+    }
+    r = write(fd, buf, sizeof(buf));
+    if (r < 0) {
+      // write() only returns -1; the real cause is in errno
+      r = -errno;
+      derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+      ::unlink(fn);
+      return r;
+    }
+  }
+
+  // fiemap an extent inside that
+  if (!m_filestore_fiemap) {
+    dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
+    ioctl_fiemap = false;
+  } else {
+    struct fiemap *fiemap;
+    int r = do_fiemap(fd, 2430421, 59284, &fiemap);
+    if (r < 0) {
+      dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
+      ioctl_fiemap = false;
+    } else {
+      if (fiemap->fm_mapped_extents == 0) {
+        dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
+        ioctl_fiemap = false;
+      } else {
+        dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
+        ioctl_fiemap = true;
+      }
+      free(fiemap);
+    }
+  }
+
+  // SEEK_DATA/SEEK_HOLE detection
+  if (!m_filestore_seek_data_hole) {
+    dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
+    seek_data_hole = false;
+  } else {
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+    // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
+    // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
+    // Fall back to use fiemap.
+    off_t hole_pos;
+
+    hole_pos = lseek(fd, 0, SEEK_HOLE);
+    if (hole_pos < 0) {
+      if (errno == EINVAL) {
+        dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
+        seek_data_hole = false;
+      } else {
+        // capture errno before logging/close can overwrite it
+        int err = -errno;
+        derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(err) << dendl;
+        VOID_TEMP_FAILURE_RETRY(::close(fd));
+        ::unlink(fn);
+        return err;
+      }
+    } else {
+      dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
+      seek_data_hole = true;
+    }
+#endif
+  }
+
+  //splice detection
+#ifdef CEPH_HAVE_SPLICE
+  if (!m_filestore_splice) {
+    dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl;
+    use_splice = false;
+  } else {
+    int pipefd[2];
+    loff_t off_in = 0;
+    int r;
+    if (pipe_cloexec(pipefd) < 0) {
+      int e = errno;
+      dout(0) << "detect_features: splice pipe met error " << cpp_strerror(e) << dendl;
+    } else {
+      lseek(fd, 0, SEEK_SET);
+      r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
+      // only EINVAL is treated as "splice not usable here"
+      if (!(r < 0 && errno == EINVAL)) {
+	use_splice = true;
+	dout(0) << "detect_features: splice is supported" << dendl;
+      } else
+	dout(0) << "detect_features: splice is NOT supported" << dendl;
+      close(pipefd[0]);
+      close(pipefd[1]);
+    }
+  }
+#endif
+  ::unlink(fn);
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+
+  bool have_syncfs = false;
+#ifdef HAVE_SYS_SYNCFS
+  if (::syncfs(get_basedir_fd()) == 0) {
+    dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
+    have_syncfs = true;
+  } else {
+    dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
+  }
+#elif defined(SYS_syncfs)
+  if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
+    dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
+    have_syncfs = true;
+  } else {
+    dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+  }
+#elif defined(__NR_syncfs)
+  if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
+    dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
+    have_syncfs = true;
+  } else {
+    dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+  }
+#endif
+  if (!have_syncfs) {
+    dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
+    if (m_filestore_fsync_flushes_journal_data) {
+      dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
+    } else {
+      dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
+      dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Make sure the "current" directory exists.
+ *
+ * @return 0 if it already is a directory or was created, -EINVAL if the
+ *         path exists but is not a directory, negative errno if mkdir fails.
+ */
+int GenericFileStoreBackend::create_current()
+{
+  const string& cur = get_current_path();
+  struct stat sb;
+
+  if (::stat(cur.c_str(), &sb) == 0) {
+    // current/ exists
+    if (S_ISDIR(sb.st_mode))
+      return 0;
+    dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
+    return -EINVAL;
+  }
+
+  // stat failed for any reason: try to create the directory
+  if (::mkdir(cur.c_str(), 0755) < 0) {
+    int err = -errno;
+    dout(0) << "_create_current: mkdir " << cur << " failed: "<< cpp_strerror(err) << dendl;
+    return err;
+  }
+  return 0;
+}
+
+/**
+ * Flush the filesystem holding the store.
+ *
+ * With 'filestore fsync flushes journal data' set, a single fsync on the op
+ * fd is issued; otherwise sync_filesystem() is called on the current/ fd.
+ *
+ * @return 0 on success, negative errno from fsync, or whatever
+ *         sync_filesystem() returns.
+ */
+int GenericFileStoreBackend::syncfs()
+{
+  if (!m_filestore_fsync_flushes_journal_data) {
+    dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
+    return sync_filesystem(get_current_fd());
+  }
+
+  dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
+  // make the file system's journal commit.
+  //  this works with ext3, but NOT ext4
+  if (::fsync(get_op_fd()) < 0)
+    return -errno;
+  return 0;
+}
+
+/**
+ * Fetch the extent map of [start, start+len) for an open file.
+ *
+ * Works in two FS_IOC_FIEMAP passes: the first runs on a zeroed header (no
+ * extent slots) and is used only to read fm_mapped_extents; the buffer is
+ * then grown to hold that many extents and the ioctl is repeated to fill
+ * them in.
+ *
+ * @param fd      open file descriptor to query
+ * @param start   byte offset of the range (rounded down to CEPH_PAGE_SIZE)
+ * @param len     length of the range in bytes
+ * @param pfiemap out: on success a heap buffer the caller must free();
+ *                set to NULL on the done_err path.  Left untouched if the
+ *                initial allocation fails.
+ * @return 0 on success, -ENOMEM on allocation failure, -ENOTSUP on
+ *         Apple/FreeBSD builds, otherwise negative errno from ioctl.
+ */
+int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
+{
+  struct fiemap *fiemap = NULL;
+  struct fiemap *_realloc_fiemap = NULL;
+  int size;
+  int ret;
+
+  fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1);
+  if (!fiemap)
+    return -ENOMEM;
+  /*
+   * There is a bug in xfs's fiemap handling.  Given (offset=3990, len=4096)
+   * the result is (logical=4096, len=4096), which misses [3990, 4096).
+   * Commit "xfs: fix rounding error of fiemap length parameter
+   * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fixes this bug.
+   * Here we align the offset down to CEPH_PAGE_SIZE (and extend the length
+   * by the same amount) to avoid it.
+   */
+  fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
+  fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
+  fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+  ret = -ENOTSUP;
+  goto done_err;
+#else
+  // pass 1: header only, to learn how many extents the range maps to
+  if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
+    ret = -errno;
+    goto done_err;
+  }
+#endif
+  size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
+
+  // on realloc failure the original buffer is still valid and is freed below
+  _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
+  if (!_realloc_fiemap) {
+    ret = -ENOMEM;
+    goto done_err;
+  } else {
+    fiemap = _realloc_fiemap;
+  }
+
+  memset(fiemap->fm_extents, 0, size);
+
+  fiemap->fm_extent_count = fiemap->fm_mapped_extents;
+  fiemap->fm_mapped_extents = 0;
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+  ret = -ENOTSUP;
+  goto done_err;
+#else
+  // pass 2: same range, now with room for the extents
+  if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
+    ret = -errno;
+    goto done_err;
+  }
+  *pfiemap = fiemap;
+#endif
+  return 0;
+
+done_err:
+  *pfiemap = NULL;
+  free(fiemap);
+  return ret;
+}
+
+
+/**
+ * Load the sloppy crc map stored in the SLOPPY_CRC_XATTR xattr of @p fd.
+ *
+ * A missing xattr (-ENODATA) is not an error: @p cm is left as the caller
+ * initialised it.  Values that do not fit the on-stack buffer are re-read
+ * into a heap buffer of the size the xattr layer reports.
+ *
+ * @param fd open fd of the object file
+ * @param cm map to decode into
+ * @return 0 on success or when no xattr exists, the negative errno from the
+ *         xattr read on I/O failure, -EIO if the stored blob cannot be
+ *         decoded.
+ */
+int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
+{
+  char buf[100];
+  bufferptr bp;
+  int r = 0;
+  int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
+  if (l == -ERANGE) {
+    // too big for buf: ask for the size, then read into a fitting buffer
+    l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
+    if (l > 0) {
+      bp = buffer::create(l);
+      l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
+    }
+  } else if (l >= 0) {
+    bp = buffer::create(l);
+    memcpy(bp.c_str(), buf, l);
+  }
+  if (l == -ENODATA) {
+    return 0;
+  }
+  if (l < 0) {
+    // report the real read error instead of decoding an unfilled buffer
+    derr << __func__ << " got " << cpp_strerror(l) << dendl;
+    return l;
+  }
+  bufferlist bl;
+  bl.append(std::move(bp));
+  auto p = bl.cbegin();
+  try {
+    decode(*cm, p);
+  }
+  catch (buffer::error &e) {
+    r = -EIO;
+  }
+  if (r < 0)
+    derr << __func__ << " got " << cpp_strerror(r) << dendl;
+  return r;
+}
+
+/**
+ * Encode @p cm and store it in the SLOPPY_CRC_XATTR xattr of @p fd.
+ *
+ * @return result of chain_fsetxattr(); negative values are logged.
+ */
+int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
+{
+  bufferlist encoded;
+  encode(*cm, encoded);
+  int rc = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, encoded.c_str(), encoded.length());
+  if (rc >= 0)
+    return rc;
+  derr << __func__ << " got " << cpp_strerror(rc) << dendl;
+  return rc;
+}
+
+/**
+ * Record a write of @p bl at [off, off+len) in the object's sloppy crc map:
+ * load the map, apply the write, save it back.
+ *
+ * @return negative error from load or save, otherwise the save result.
+ */
+int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
+{
+  SloppyCRCMap crc_map(get_crc_block_size());
+  int rc = _crc_load_or_init(fd, &crc_map);
+  if (rc >= 0) {
+    ostringstream log;
+    crc_map.write(off, len, bl, &log);
+    dout(30) << __func__ << "\n" << log.str() << dendl;
+    rc = _crc_save(fd, &crc_map);
+  }
+  return rc;
+}
+
+/**
+ * Record a truncate to @p off in the object's sloppy crc map:
+ * load the map, apply the truncate, save it back.
+ *
+ * @return negative error from load or save, otherwise the save result.
+ */
+int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
+{
+  SloppyCRCMap crc_map(get_crc_block_size());
+  int rc = _crc_load_or_init(fd, &crc_map);
+  if (rc >= 0) {
+    crc_map.truncate(off);
+    rc = _crc_save(fd, &crc_map);
+  }
+  return rc;
+}
+
+/**
+ * Record zeroing of [off, off+len) in the object's sloppy crc map:
+ * load the map, apply the zero, save it back.
+ *
+ * @return negative error from load or save, otherwise the save result.
+ */
+int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
+{
+  SloppyCRCMap crc_map(get_crc_block_size());
+  int rc = _crc_load_or_init(fd, &crc_map);
+  if (rc >= 0) {
+    crc_map.zero(off, len);
+    rc = _crc_save(fd, &crc_map);
+  }
+  return rc;
+}
+
+/**
+ * Mirror a clone of [srcoff, srcoff+len) from @p srcfd to @p dstoff in
+ * @p destfd in the destination's sloppy crc map.  Both maps are loaded;
+ * only the destination map is modified and saved.
+ *
+ * @return negative error from either load or from the save, otherwise the
+ *         save result.
+ */
+int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
+						     loff_t srcoff, size_t len, loff_t dstoff)
+{
+  SloppyCRCMap src_map(get_crc_block_size());
+  SloppyCRCMap dst_map(get_crc_block_size());
+
+  int rc = _crc_load_or_init(srcfd, &src_map);
+  if (rc >= 0)
+    rc = _crc_load_or_init(destfd, &dst_map);
+  if (rc < 0)
+    return rc;
+
+  ostringstream log;
+  dst_map.clone_range(srcoff, len, dstoff, src_map, &log);
+  dout(30) << __func__ << "\n" << log.str() << dendl;
+  return _crc_save(destfd, &dst_map);
+}
+
+/**
+ * Check data read from [off, off+len) against the object's sloppy crc map.
+ *
+ * @param out stream handed through to SloppyCRCMap::read()
+ * @return negative error if the map cannot be loaded, otherwise the result
+ *         of SloppyCRCMap::read().
+ */
+int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+					      ostream *out)
+{
+  SloppyCRCMap crc_map(get_crc_block_size());
+  int rc = _crc_load_or_init(fd, &crc_map);
+  if (rc >= 0)
+    rc = crc_map.read(off, len, bl, out);
+  return rc;
+}
diff --git a/src/os/filestore/GenericFileStoreBackend.h b/src/os/filestore/GenericFileStoreBackend.h
new file mode 100644
index 00000000..207c3d0d
--- /dev/null
+++ b/src/os/filestore/GenericFileStoreBackend.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_GENERICFILESTOREBACKEDN_H
+#define CEPH_GENERICFILESTOREBACKEDN_H
+
+#include "FileStore.h"
+
+class SloppyCRCMap;
+
+/**
+ * FileStoreBackend implementation with no filesystem-specific features.
+ *
+ * Checkpoint operations and allocation hints return -EOPNOTSUPP,
+ * clone_range() falls back to a byte copy, and the optional fast paths
+ * (FIEMAP, SEEK_DATA/SEEK_HOLE, splice) are reported through the flags
+ * below.
+ */
+class GenericFileStoreBackend : public FileStoreBackend {
+private:
+  // feature flags; the constructor initialises them to false
+  bool ioctl_fiemap;
+  bool seek_data_hole;
+  bool use_splice;
+  // config option values captured by the constructor
+  bool m_filestore_fiemap;
+  bool m_filestore_seek_data_hole;
+  bool m_filestore_fsync_flushes_journal_data;
+  bool m_filestore_splice;
+  // default to rotational
+  bool m_rotational = true;
+  bool m_journal_rotational = true;
+public:
+  explicit GenericFileStoreBackend(FileStore *fs);
+  ~GenericFileStoreBackend() override {}
+
+  const char *get_name() override {
+    return "generic";
+  }
+  int detect_features() override;
+  int create_current() override;
+  bool can_checkpoint() override { return false; }
+  bool is_rotational() override {
+    return m_rotational;
+  }
+  bool is_journal_rotational() override {
+    return m_journal_rotational;
+  }
+  // checkpoints are not available in the generic backend
+  int list_checkpoints(list<string>& ls) override { return 0; }
+  int create_checkpoint(const string& name, uint64_t *cid) override { return -EOPNOTSUPP; }
+  int sync_checkpoint(uint64_t id) override { return -EOPNOTSUPP; }
+  int rollback_to(const string& name) override { return -EOPNOTSUPP; }
+  int destroy_checkpoint(const string& name) override { return -EOPNOTSUPP; }
+  int syncfs() override;
+  bool has_fiemap() override { return ioctl_fiemap; }
+  bool has_seek_data_hole() override { return seek_data_hole; }
+  int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) override;
+  /// No native clone here: delegates to the base class _copy_range().
+  int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override {
+    return _copy_range(from, to, srcoff, len, dstoff);
+  }
+  int set_alloc_hint(int fd, uint64_t hint) override { return -EOPNOTSUPP; }
+  bool has_splice() const override { return use_splice; }
+private:
+  // helpers that move a SloppyCRCMap to and from the object's xattr
+  int _crc_load_or_init(int fd, SloppyCRCMap *cm);
+  int _crc_save(int fd, SloppyCRCMap *cm);
+public:
+  // sloppy crc hooks (see FileStoreBackend)
+  int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) override;
+  int _crc_update_truncate(int fd, loff_t off) override;
+  int _crc_update_zero(int fd, loff_t off, size_t len) override;
+  int _crc_update_clone_range(int srcfd, int destfd,
+				      loff_t srcoff, size_t len, loff_t dstoff) override;
+  int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+			       ostream *out) override;
+};
+#endif
diff --git a/src/os/filestore/HashIndex.cc b/src/os/filestore/HashIndex.cc
new file mode 100644
index 00000000..ab56b43c
--- /dev/null
+++ b/src/os/filestore/HashIndex.cc
@@ -0,0 +1,1195 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+#include <errno.h>
+
+#include "HashIndex.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+
+const string HashIndex::SUBDIR_ATTR = "contents";
+const string HashIndex::SETTINGS_ATTR = "settings";
+const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
+
+/// Map one hex digit ('0'-'9' or upper-case 'A'-'F') to 0..15; aborts otherwise.
+int hex_to_int(char c)
+{
+  const bool is_decimal = (c >= '0' && c <= '9');
+  const bool is_upper_hex = (c >= 'A' && c <= 'F');
+  if (!is_decimal && !is_upper_hex)
+    ceph_abort();  // lower-case digits are rejected as well
+  return is_decimal ? (c - '0') : (c - 'A' + 10);
+}
+
+/// Map 0..15 to its upper-case hex digit; asserts that v is within that range.
+char int_to_hex(int v)
+{
+  ceph_assert(v >= 0 && v < 16);  // also reject negatives, not only v >= 16
+  if (v < 10)
+    return '0' + v;
+  return 'A' + v - 10;
+}
+
+/// Mirror the four low bits of a nibble (bit 0 <-> bit 3, bit 1 <-> bit 2).
+int reverse_nibble_bits(int in)
+{
+  ceph_assert(in < 16);
+  const int bit3_to_0 = (in & 8) >> 3;
+  const int bit2_to_1 = (in & 4) >> 1;
+  const int bit1_to_2 = (in & 2) << 1;
+  const int bit0_to_3 = (in & 1) << 3;
+  return bit3_to_0 | bit2_to_1 | bit1_to_2 | bit0_to_3;
+}
+
+/// Bit-reverse the nibble encoded by hex digit c and re-encode it as a digit.
+char reverse_hexdigit_bits(char c) {
+  const int mirrored = reverse_nibble_bits(hex_to_int(c));
+  return int_to_hex(mirrored);
+}
+
+/// Return a copy of s in which every hex digit has had its nibble bit-reversed.
+string reverse_hexdigit_bits_string(string s)
+{
+  for (char &digit : s)
+    digit = reverse_hexdigit_bits(digit);
+  return s;
+}
+
+/// "Less than" for two one-character hex strings, ordering them by the
+/// bit-reversed value of their digit.
+bool cmp_hexdigit_bitwise(const string& l, const string& r)
+{
+  ceph_assert(l.length() == 1 && r.length() == 1);
+  const int lhs_nibble = hex_to_int(l[0]), rhs_nibble = hex_to_int(r[0]);
+  ceph_assert(lhs_nibble < 16);
+  ceph_assert(rhs_nibble < 16);
+  return reverse_nibble_bits(lhs_nibble) < reverse_nibble_bits(rhs_nibble);
+}
+
+/// Compare two hex strings after bit-reversing the nibble of every digit.
+bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
+{
+  const string lhs_key = reverse_hexdigit_bits_string(l);
+  const string rhs_key = reverse_hexdigit_bits_string(r);
+  return lhs_key < rhs_key;
+}
+
+/// Finish whatever operation the root's IN_PROGRESS_OP_TAG xattr records.
+int HashIndex::cleanup() {
+  bufferlist tag_bl;
+  int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, tag_bl);
+  if (r < 0) {
+    // any failure to read the tag is taken to mean nothing was in progress
+    return 0;
+  }
+  auto tag_it = tag_bl.cbegin();
+  InProgressOp in_progress(tag_it);
+  subdir_info_s info;
+  r = get_info(in_progress.path, &info);
+  if (r == -ENOENT)
+    return end_split_or_merge(in_progress.path);  // no info there: just drop the tag
+  if (r < 0)
+    return r;
+
+  if (in_progress.is_split())
+    return complete_split(in_progress.path, info);
+  if (in_progress.is_merge())
+    return complete_merge(in_progress.path, info);
+  if (!in_progress.is_col_split())
+    return -EINVAL;
+
+  // col split: recompute the info xattr of every proper prefix of the path
+  // (root first; the full path itself is not visited)
+  const vector<string> &op_path = in_progress.path;
+  for (auto prefix_end = op_path.begin();
+       prefix_end != op_path.end(); ++prefix_end) {
+    r = reset_attr(vector<string>(op_path.begin(), prefix_end));
+    if (r < 0)
+      return r;
+  }
+  return 0;
+}
+
+/// Rebuild path's info xattr from a fresh listing; a missing path returns 0.
+int HashIndex::reset_attr(const vector<string> &path)
+{
+  int present = 0;
+  int r = path_exists(path, &present);
+  if (r < 0)
+    return r;
+  if (!present)
+    return 0;
+
+  map<string, ghobject_t> found_objects;
+  r = list_objects(path, 0, 0, &found_objects);
+  if (r < 0)
+    return r;
+  vector<string> found_subdirs;
+  r = list_subdirs(path, &found_subdirs);
+  if (r < 0)
+    return r;
+  subdir_info_s fresh;
+  fresh.hash_level = path.size();
+  fresh.objs = found_objects.size();
+  fresh.subdirs = found_subdirs.size();
+  return set_info(path, fresh);
+}
+
+/**
+ * Move the subdirs and objects under path whose hash matches (inbits, match)
+ * from `from` to `to`, recursing into subdirs that hold fewer than inbits bits.
+ * Errors from tagging/untagging the in-progress op are now propagated instead
+ * of being dropped.  *mkdirred appears to count the leading components of
+ * path already created in `to`.  Returns 0 or a negative errno.
+ */
+int HashIndex::col_split_level(
+  HashIndex &from,
+  HashIndex &to,
+  const vector<string> &path,
+  uint32_t inbits,
+  uint32_t match,
+  unsigned *mkdirred)
+{
+  /* For each subdir, move, recurse, or ignore based on comparing the low order
+   * bits of the hash represented by the subdir path with inbits, match passed
+   * in.
+   */
+  vector<string> subdirs;
+  int r = from.list_subdirs(path, &subdirs);
+  if (r < 0)
+    return r;
+  map<string, ghobject_t> objects;
+  r = from.list_objects(path, 0, 0, &objects);
+  if (r < 0)
+    return r;
+
+  set<string> to_move;
+  for (const string &sub : subdirs) {
+    uint32_t bits = 0;
+    uint32_t hash = 0;
+    vector<string> sub_path(path.begin(), path.end());
+    sub_path.push_back(sub);
+    path_to_hobject_hash_prefix(sub_path, &bits, &hash);
+    if (bits < inbits) {
+      if (hobject_t::match_hash(hash, bits, match)) {
+        r = col_split_level(from, to, sub_path, inbits, match, mkdirred);
+        if (r < 0)
+          return r;
+        if (*mkdirred > path.size())
+          *mkdirred = path.size();
+      } // else, skip, doesn't need to be moved or recursed into
+    } else {
+      if (hobject_t::match_hash(hash, inbits, match)) {
+        to_move.insert(sub);
+      } // else, skip, doesn't need to be moved or recursed into
+    }
+  }
+
+  /* Then, do the same for each object */
+  map<string, ghobject_t> objs_to_move;
+  for (auto &obj : objects) {
+    if (obj.second.match(inbits, match)) {
+      objs_to_move.insert(obj);
+    }
+  }
+
+  if (objs_to_move.empty() && to_move.empty())
+    return 0;
+
+  // Make parent directories as needed
+  while (*mkdirred < path.size()) {
+    ++*mkdirred;
+    int exists = 0;
+    vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
+    r = to.path_exists(creating_path, &exists);
+    if (r < 0)
+      return r;
+    if (exists)
+      continue;
+    subdir_info_s info;
+    info.objs = 0;
+    info.subdirs = 0;
+    info.hash_level = creating_path.size();
+    if (*mkdirred < path.size() - 1)
+      info.subdirs = 1;
+    r = to.start_col_split(creating_path);
+    if (r < 0)
+      return r;
+    r = to.create_path(creating_path);
+    if (r < 0)
+      return r;
+    r = to.set_info(creating_path, info);
+    if (r < 0)
+      return r;
+    r = to.end_split_or_merge(creating_path);
+    if (r < 0)
+      return r;
+  }
+
+  subdir_info_s from_info;
+  subdir_info_s to_info;
+  r = from.get_info(path, &from_info);
+  if (r < 0)
+    return r;
+  r = to.get_info(path, &to_info);
+  if (r < 0)
+    return r;
+
+  // Tag both collections before touching anything; a failure to record the
+  // in-progress op must not be ignored, otherwise cleanup() has no tag to
+  // act on after an interrupted move.
+  r = from.start_col_split(path);
+  if (r < 0)
+    return r;
+  r = to.start_col_split(path);
+  if (r < 0)
+    return r;
+
+  // Do subdir moves
+  for (const string &sub : to_move) {
+    from_info.subdirs--;
+    to_info.subdirs++;
+    r = move_subdir(from, to, path, sub);
+    if (r < 0)
+      return r;
+  }
+
+  for (auto &obj : objs_to_move) {
+    from_info.objs--;
+    to_info.objs++;
+    r = move_object(from, to, path, obj);
+    if (r < 0)
+      return r;
+  }
+
+  r = to.set_info(path, to_info);
+  if (r < 0)
+    return r;
+  r = from.set_info(path, from_info);
+  if (r < 0)
+    return r;
+  r = from.end_split_or_merge(path);
+  if (r < 0)
+    return r;
+  return to.end_split_or_merge(path);
+}
+
+int HashIndex::_merge(
+  uint32_t bits,
+  CollectionIndex* dest) {
+  dout(20) << __func__ << " bits " << bits << dendl;
+  ceph_assert(collection_version() == dest->collection_version());
+
+  vector<string> emptypath;
+
+  // pre-split to common/target level so that any shared prefix DIR_?
+  // directories already exist at the destination.  Since each
+  // directory is a nibble (4 bits),
+  unsigned shared = bits / 4;
+  dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl;
+  if (shared) {
+    split_dirs(emptypath, shared);
+    ((HashIndex*)dest)->split_dirs(emptypath, shared);
+  }
+
+  // now merge the contents
+  _merge_dirs(*this, *(HashIndex*)dest, emptypath);
+
+  return 0;
+}
+
+/// Recursively fold the directory tree of `from` at path into `to`: subdirs
+/// missing in `to` are moved whole, shared ones are merged and then removed
+/// from `from`, and finally the objects directly in path are moved.
+int HashIndex::_merge_dirs(
+  HashIndex& from,
+  HashIndex& to,
+  const vector<string>& path)
+{
+  dout(20) << __func__ << " path " << path << dendl;
+  int r;
+  vector<string> src_subs, dst_subs;
+  r = from.list_subdirs(path, &src_subs);
+  if (r < 0) {
+    lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+						  << " r " << r << " from "
+						  << "from.list_subdirs"
+						  << dendl;
+    return r;
+  }
+  r = to.list_subdirs(path, &dst_subs);
+  if (r < 0) {
+    lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+						  << " r " << r << " from "
+						  << "to.list_subdirs"
+						  << dendl;
+    return r;
+  }
+
+  for (auto& sub : src_subs) {
+    if (std::find(dst_subs.begin(), dst_subs.end(), sub) == dst_subs.end()) {
+      // only the source has it: move it
+      r = move_subdir(from, to, path, sub);
+      if (r < 0) {
+	lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+						      << " r " << r << " from "
+						      << "move_subdir(...,"
+						      << path << "," << sub << ")"
+						      << dendl;
+	return r;
+      }
+    } else {
+      // common, recurse!
+      vector<string> nested = path;
+      nested.push_back(sub);
+      r = _merge_dirs(from, to, nested);
+      if (r < 0) {
+	lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+						      << " r " << r << " from "
+						      << "rec _merge_dirs"
+						      << dendl;
+	return r;
+      }
+      // now remove it from the source index explicitly (was implicit `this`)
+      r = from.remove_path(nested);
+      if (r < 0) {
+	lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+						      << " r " << r << " from "
+						      << "remove_path "
+						      << nested
+						      << dendl;
+	return r;
+      }
+    }
+  }
+
+  // objects
+  map<string, ghobject_t> objects;
+  r = from.list_objects(path, 0, 0, &objects);
+  if (r < 0) {
+    lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+						  << " r " << r << " from "
+						  << "from.list_objects"
+						  << dendl;
+    return r;
+  }
+
+  for (auto& obj : objects) {
+    r = move_object(from, to, path, obj);
+    if (r < 0) {
+      lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+						    << " r " << r << " from "
+						    << "move_object(...,"
+						    << path << "," << obj << ")"
+						    << dendl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+
+/**
+ * Split this collection: hand everything matching (match, bits) to dest.
+ * Starts col_split_level() at the collection root with no destination
+ * directory created yet and passes its return value through.
+ * dest must report the same collection_version() as this index (asserted).
+ */
+int HashIndex::_split(
+  uint32_t match,
+  uint32_t bits,
+  CollectionIndex* dest) {
+  ceph_assert(collection_version() == dest->collection_version());
+  HashIndex &target = *static_cast<HashIndex*>(dest);
+  unsigned created_levels = 0;
+  return col_split_level(*this, target, vector<string>(), bits, match, &created_levels);
+}
+
+/// Split path when must_split(info, target_level) says so, then recurse into
+/// every subdir of path; returns a negative errno on the first failure.
+int HashIndex::split_dirs(const vector<string> &path, int target_level) {
+  dout(20) << __func__ << " " << path << " target level: "
+           << target_level << dendl;
+  subdir_info_s info;
+  int r = get_info(path, &info);
+  if (r < 0) {
+    dout(10) << "error looking up info for " << path << ": "
+             << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  if (must_split(info, target_level)) {
+    dout(1) << __func__ << " " << path << " has " << info.objs
+            << " objects, " << info.hash_level
+            << " level, starting split in pg " << coll() << "." << dendl;
+    r = initiate_split(path, info);
+    if (r < 0) {
+      dout(10) << "error initiating split on " << path << ": "
+               << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    r = complete_split(path, info);
+    if (r < 0) {
+      dout(10) << "error completing split on " << path << ": "
+               << cpp_strerror(r) << dendl;
+      return r;
+    }
+    // only announce completion once complete_split() has actually succeeded
+    dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
+            << dendl;
+  }
+
+  vector<string> subdirs;
+  r = list_subdirs(path, &subdirs);
+  if (r < 0) {
+    dout(10) << "error listing subdirs of " << path << ": "
+             << cpp_strerror(r) << dendl;
+    return r;
+  }
+  for (const string &sub : subdirs) {
+    vector<string> subdir_path(path);
+    subdir_path.push_back(sub);
+    r = split_dirs(subdir_path, target_level);
+    if (r < 0)
+      return r;
+  }
+  return r;
+}
+
+/// Persist the settings xattr, then run split_dirs() from the root.
+int HashIndex::apply_layout_settings(int target_level) {
+  dout(10) << __func__ << " split multiple = " << split_multiplier
+           << " merge threshold = " << merge_threshold
+           << " split rand factor = " << cct->_conf->filestore_split_rand_factor
+           << " target level = " << target_level
+           << dendl;
+  const int r = write_settings();
+  if (r < 0)
+    return r;
+  return split_dirs(vector<string>(), target_level);
+}
+
+/// Stamp the root with a default-constructed subdir_info_s, then store settings.
+int HashIndex::_init() {
+  subdir_info_s root_info;
+  const int r = set_info(vector<string>(), root_info);
+  if (r < 0)
+    return r;
+  return write_settings();
+}
+
+/// Re-roll settings.split_rand_factor (0 when the option is 0) and persist settings.
+int HashIndex::write_settings() {
+  settings.split_rand_factor = 0;
+  if (cct->_conf->filestore_split_rand_factor > 0)
+    settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor;
+
+  bufferlist encoded;
+  settings.encode(encoded);
+  const vector<string> root;
+  return add_attr_path(root, SETTINGS_ATTR, encoded);
+}
+
+int HashIndex::read_settings() {
+  vector<string> path;
+  bufferlist bl;
+  int r = get_attr_path(path, SETTINGS_ATTR, bl);
+  if (r == -ENODATA)
+    return 0;
+  if (r < 0) {
+    derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  auto it = bl.cbegin();
+  settings.decode(it);
+  dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
+  return 0;
+}
+
+/* LFNIndex virtual method implementations */
+/// Account for a newly created object in path and split the directory when
+/// must_split() asks for it.
+int HashIndex::_created(const vector<string> &path,
+			const ghobject_t &oid,
+			const string &mangled_name) {
+  subdir_info_s info;
+  int r = get_info(path, &info);
+  if (r < 0)
+    return r;
+  info.objs++;
+  r = set_info(path, info);
+  if (r < 0)
+    return r;
+  if (!must_split(info))
+    return 0;
+  dout(1) << __func__ << " " << path << " has " << info.objs
+          << " objects, starting split in pg " << coll() << "." << dendl;
+  r = initiate_split(path, info);
+  if (r < 0)
+    return r;
+  r = complete_split(path, info);
+  if (r < 0)
+    return r;  // do not report a split that failed as completed
+  dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
+          << dendl;
+  return r;
+}
+
+/// Unlink oid from path, decrement the directory's object count and, when
+/// must_merge() says so, merge the directory into its parent.
+int HashIndex::_remove(const vector<string> &path,
+		       const ghobject_t &oid,
+		       const string &mangled_name) {
+  int r = remove_object(path, oid);
+  if (r < 0)
+    return r;
+
+  subdir_info_s info;
+  r = get_info(path, &info);
+  if (r < 0)
+    return r;
+  info.objs--;
+  r = set_info(path, info);
+  if (r < 0)
+    return r;
+  if (!must_merge(info))
+    return 0;
+  r = initiate_merge(path, info);
+  if (r < 0)
+    return r;
+  return complete_merge(path, info);
+}
+
+/// Walk from *path down oid's hash digits to the deepest existing directory.
+int HashIndex::_lookup(const ghobject_t &oid,
+		       vector<string> *path,
+		       string *mangled_name, int *hardlink) {
+  vector<string> components;
+  get_path_components(oid, &components);
+  auto pending = components.begin();
+  int exists;
+  for (;;) {
+    const int r = path_exists(*path, &exists);
+    if (r < 0)
+      return r;
+    if (!exists) {
+      if (path->empty())
+        return -ENOENT;
+      path->pop_back();  // step back to the last directory that did exist
+      break;
+    }
+    if (pending == components.end())
+      break;
+    path->push_back(*pending++);
+  }
+  return get_mangled_name(*path, oid, mangled_name, hardlink);
+}
+
+/// List objects from start via list_by_hash(), bounded by end and max_count;
+/// *next (when given) is seeded with start and updated by the listing.
+int HashIndex::_collection_list_partial(const ghobject_t &start,
+					const ghobject_t &end,
+					int max_count,
+					vector<ghobject_t> *ls,
+					ghobject_t *next) {
+  ghobject_t scratch_next;
+  ghobject_t *cursor = next ? next : &scratch_next;
+  *cursor = start;
+  dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
+  return list_by_hash(vector<string>(), end, max_count, cursor, ls);
+}
+
+int HashIndex::prep_delete() {
+  return recursive_remove(vector<string>());  // start at the collection root (empty path)
+}
+
+/// Pre-create the hashed directory tree for an expected object count.
+int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
+  vector<string> root;
+  subdir_info_s root_info;
+  // NOTE(review): only verifies that the root info can be read; the counts in
+  // root_info are not inspected, so emptiness is not actually enforced here.
+  int ret = get_info(root, &root_info);
+  if (ret < 0)
+    return ret;
+
+  // Create the directories first ...
+  ret = pre_split_folder(pg_num, expected_num_objs);
+  if (ret < 0)
+    return ret;
+  // ... then write the info xattr of every directory, starting from the root
+  return init_split_folder(root, 0);
+}
+
+/// Pre-create hashed sub-directories sized for expected_num_objs objects.
+/// Does nothing when merge_threshold > 0 or expected_num_objs == 0.
+int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
+{
+  // If folder merging is enabled (by setting the threshold positive),
+  // no need to split
+  if (merge_threshold > 0)
+    return 0;
+  const coll_t c = coll();
+  // Do not split if the expected number of objects in this collection is zero (by default)
+  if (expected_num_objs == 0)
+    return 0;
+  // Calculate the number of leaf folders (which actually store files)
+  // need to be created
+  const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
+  // merge_threshold == 0 (or split_multiplier == 0) with no random factor
+  // makes this 0; skip pre-splitting instead of dividing by zero below
+  if (objs_per_folder == 0)
+    return 0;
+  uint64_t leaves_needed = expected_num_objs / objs_per_folder;
+  // No need to split
+  if (leaves_needed == 0 || expected_num_objs == objs_per_folder)
+    return 0;
+
+  spg_t spgid;
+  if (!c.is_pg_prefix(&spgid))
+    return -EINVAL;
+  const ps_t ps = spgid.pgid.ps();
+  // the most significant bits of pg_num
+  const int pg_num_bits = calc_num_bits(pg_num - 1);
+  ps_t tmp_id = ps;
+  // calculate the number of levels we only create one sub folder
+  int num = pg_num_bits / 4;
+  // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
+  // so that splitting starts at level 3
+  if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits))
+    --num;
+  int ret;
+  // Start with creation that only has one subfolder
+  vector<string> paths;
+  int dump_num = num;
+  while (num-- > 0) {
+    ps_t v = tmp_id & 0x0000000f;
+    paths.push_back(to_hex(v));
+    ret = create_path(paths);
+    if (ret < 0 && ret != -EEXIST)
+      return ret;
+    tmp_id = tmp_id >> 4;
+  }
+  // Starting from here, we can split by creating multiple subfolders
+  const int left_bits = pg_num_bits - dump_num * 4;
+  // this variable denotes how many bits (for this level) that can be
+  // used for sub folder splitting
+  int split_bits = 4 - left_bits;
+  // the below logic is inspired by rados.h#ceph_stable_mod,
+  // it basically determines how many sub-folders should we
+  // create for splitting
+  ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
+  if ((((uint32_t)1 << (pg_num_bits - 1)) | ps) >= pg_num)  // unsigned shift: no signed overflow at bit 31
+    ++split_bits;
+  const uint32_t subs = (1 << split_bits);
+  // Calculate how many levels we create starting from here
+  int level  = 0;
+  int level_limit = MAX_HASH_LEVEL - dump_num - 1;
+  uint64_t actual_leaves = subs;
+  while (actual_leaves < leaves_needed && level < level_limit) {
+    ++level;
+    actual_leaves <<= 4;
+  }
+  for (uint32_t i = 0; i < subs; ++i) {
+    ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
+    int v = tmp_id | (i << ((4 - split_bits) % 4));
+    paths.push_back(to_hex(v));
+    ret = create_path(paths);
+    if (ret < 0 && ret != -EEXIST)
+      return ret;
+    ret = recursive_create_path(paths, level);
+    if (ret < 0)
+      return ret;
+    paths.pop_back();
+  }
+  return 0;
+}
+
+/// Write the info xattr (subdir count and hash level) for path and, recursively,
+/// for every directory below it, fsyncing each directory after its xattr.
+int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
+{
+  vector<string> children;
+  int ret = list_subdirs(path, &children);
+  if (ret < 0)
+    return ret;
+  subdir_info_s info;
+  info.subdirs = children.size();
+  info.hash_level = hash_level;
+  ret = set_info(path, info);
+  if (ret < 0)
+    return ret;
+  ret = fsync_dir(path);
+  if (ret < 0)
+    return ret;
+
+  // path is used as a stack: push a child, recurse, pop it again
+  for (const string &child : children) {
+    path.push_back(child);
+    ret = init_split_folder(path, hash_level + 1);
+    if (ret < 0)
+      return ret;
+    path.pop_back();
+  }
+  return 0;
+}
+
+/// Create 16 children below path, recursing level levels deep; -EEXIST is fine.
+int HashIndex::recursive_create_path(vector<string>& path, int level) {
+  if (level == 0)
+    return 0;
+  for (int nibble = 0; nibble < 16; ++nibble) {
+    path.push_back(to_hex(nibble));
+    const int made = create_path(path);
+    if (made < 0 && made != -EEXIST)
+      return made;
+    const int below = recursive_create_path(path, level - 1);
+    if (below < 0)
+      return below;
+    path.pop_back();
+  }
+  return 0;
+}
+
+int HashIndex::recursive_remove(const vector<string> &path) {
+  return _recursive_remove(path, true);  // top=true: path itself is kept, only what is below it is removed
+}
+
+/// Remove every directory below path, and path itself unless top is set;
+/// returns -ENOTEMPTY when a visited directory still holds objects.
+int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
+  dout(20) << __func__ << " path=" << path << dendl;
+  vector<string> children;
+  int r = list_subdirs(path, &children);
+  if (r < 0)
+    return r;
+  map<string, ghobject_t> objects;
+  r = list_objects(path, 0, 0, &objects);
+  if (r < 0)
+    return r;
+  if (!objects.empty())
+    return -ENOTEMPTY;
+
+  vector<string> child_path(path);
+  for (const string &child : children) {
+    child_path.push_back(child);
+    r = _recursive_remove(child_path, false);
+    if (r < 0)
+      return r;
+    child_path.pop_back();
+  }
+
+  // the directory the outermost caller started from is left in place
+  return top ? 0 : remove_path(path);
+}
+
+/// Store a COL_SPLIT in-progress tag for path on the root, then fsync the root.
+int HashIndex::start_col_split(const vector<string> &path) {
+  InProgressOp marker(InProgressOp::COL_SPLIT, path);
+  bufferlist encoded;
+  marker.encode(encoded);
+  const vector<string> root;
+  const int r = add_attr_path(root, IN_PROGRESS_OP_TAG, encoded);
+  return r < 0 ? r : fsync_dir(root);
+}
+
+/// Store a SPLIT in-progress tag for path on the root, then fsync the root.
+int HashIndex::start_split(const vector<string> &path) {
+  InProgressOp marker(InProgressOp::SPLIT, path);
+  bufferlist encoded;
+  marker.encode(encoded);
+  const vector<string> root;
+  const int r = add_attr_path(root, IN_PROGRESS_OP_TAG, encoded);
+  return r < 0 ? r : fsync_dir(root);
+}
+
+/// Store a MERGE in-progress tag for path on the root, then fsync the root.
+int HashIndex::start_merge(const vector<string> &path) {
+  InProgressOp marker(InProgressOp::MERGE, path);
+  bufferlist encoded;
+  marker.encode(encoded);
+  const vector<string> root;
+  const int r = add_attr_path(root, IN_PROGRESS_OP_TAG, encoded);
+  return r < 0 ? r : fsync_dir(root);
+}
+
+int HashIndex::end_split_or_merge(const vector<string> &path) {
+  return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);  // the tag is removed from the root; path is unused
+}
+
+/// Decode path's SUBDIR_ATTR xattr into *info; asserts hash_level == path depth.
+int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
+  bufferlist raw;
+  const int r = get_attr_path(path, SUBDIR_ATTR, raw);
+  if (r < 0) return r;
+  auto cursor = raw.cbegin();
+  info->decode(cursor);
+  ceph_assert(path.size() == (unsigned)info->hash_level);
+  return 0;
+}
+
+int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {  // store info as path's SUBDIR_ATTR xattr
+  bufferlist buf;
+  ceph_assert(path.size() == (unsigned)info.hash_level);  // hash_level must equal the depth of path
+  info.encode(buf);
+  return add_attr_path(path, SUBDIR_ATTR, buf);
+}
+
+bool HashIndex::must_merge(const subdir_info_s &info) {
+  // never merge the root, and never merge at all unless merge_threshold > 0
+  if (info.hash_level <= 0 || merge_threshold <= 0)
+    return false;
+  return info.subdirs == 0 && info.objs < (unsigned)merge_threshold;
+}
+
+bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
+  // target_level (default 0) lets ceph-objectstore-tool split offline: a dir
+  // shallower than target_level is split no matter how many objects it holds.
+  if (!(info.hash_level < (unsigned)MAX_HASH_LEVEL))
+    return false;
+  const bool forced = target_level > 0 && info.hash_level < (unsigned)target_level;
+  return forced || info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16);
+}
+
+int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
+  return start_merge(path);  // only records the in-progress tag; info is unused
+}
+
+int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {  // fold path into its parent; info is not read here
+  vector<string> dst = path;
+  dst.pop_back();  // dst = parent of path; NOTE(review): assumes path is non-empty — confirm callers never pass the root
+  subdir_info_s dstinfo;
+  int r, exists;
+  r = path_exists(path, &exists);
+  if (r < 0)
+    return r;
+  r = get_info(dst, &dstinfo);
+  if (r < 0)
+    return r;
+  if (exists) {  // presumably false when resuming a merge whose directory was already removed
+    r = move_objects(path, dst);
+    if (r < 0)
+      return r;
+    r = reset_attr(dst);  // NOTE(review): runs before remove_path(), so path is still listed as a subdir of dst
+    if (r < 0)
+      return r;
+    r = remove_path(path);
+    if (r < 0)
+      return r;
+  }
+  if (must_merge(dstinfo)) {  // NOTE(review): dstinfo was read before the move above — confirm that is intended
+    r = initiate_merge(dst, dstinfo);
+    if (r < 0)
+      return r;
+    r = fsync_dir(dst);
+    if (r < 0)
+      return r;
+    return complete_merge(dst, dstinfo);  // cascade the merge one level up
+  }
+  r = fsync_dir(dst);
+  if (r < 0)
+    return r;
+  return end_split_or_merge(path);  // clear the in-progress tag
+}
+
+int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
+  return start_split(path);  // only records the in-progress tag; info is unused
+}
+
+int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {  // distribute path's objects into subdirs one level down
+  int level = info.hash_level;  // index of the path component that the new subdirs add
+  map<string, ghobject_t> objects;
+  vector<string> dst = path;
+  int r;
+  dst.push_back("");  // placeholder; dst[level] is set per subdir below
+  r = list_objects(path, 0, 0, &objects);
+  if (r < 0)
+    return r;
+  vector<string> subdirs_vec;
+  r = list_subdirs(path, &subdirs_vec);
+  if (r < 0)
+    return r;
+  set<string> subdirs;
+  subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
+  map<string, map<string, ghobject_t> > mapped;
+  map<string, ghobject_t> moved;
+  int num_moved = 0;  // NOTE(review): only incremented below, never read
+  for (map<string, ghobject_t>::iterator i = objects.begin();
+       i != objects.end();
+       ++i) {
+    vector<string> new_path;
+    get_path_components(i->second, &new_path);
+    mapped[new_path[level]][i->first] = i->second;  // group objects by their path component at this level
+  }
+  for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
+       i != mapped.end();
+       ) {
+    dst[level] = i->first;
+    /* If the info already exists, it must be correct,
+     * we may be picking up a partially finished split */
+    subdir_info_s temp;
+    // subdir has already been fully copied
+    if (subdirs.count(i->first) && !get_info(dst, &temp)) {
+      for (map<string, ghobject_t>::iterator j = i->second.begin();
+	   j != i->second.end();
+	   ++j) {
+	moved[j->first] = j->second;
+	num_moved++;
+	objects.erase(j->first);
+      }
+      ++i;
+      continue;
+    }
+
+    subdir_info_s info_new;
+    info_new.objs = i->second.size();
+    info_new.subdirs = 0;
+    info_new.hash_level = level + 1;
+    if (must_merge(info_new) && !subdirs.count(i->first)) {  // a new subdir would immediately qualify for merging: leave these objects in path
+      mapped.erase(i++);
+      continue;
+    }
+
+    // Subdir doesn't yet exist
+    if (!subdirs.count(i->first)) {
+      info.subdirs += 1;
+      r = create_path(dst);
+      if (r < 0)
+	return r;
+    } // else subdir has been created but only partially copied
+
+    for (map<string, ghobject_t>::iterator j = i->second.begin();
+	 j != i->second.end();
+	 ++j) {
+      moved[j->first] = j->second;
+      num_moved++;
+      objects.erase(j->first);
+      r = link_object(path, dst, j->second, j->first);
+      // May be a partially finished split
+      if (r < 0 && r != -EEXIST) {
+	return r;
+      }
+    }
+
+    r = fsync_dir(dst);
+    if (r < 0)
+      return r;
+
+    // Presence of info must imply that all objects have been copied
+    r = set_info(dst, info_new);
+    if (r < 0)
+      return r;
+
+    r = fsync_dir(dst);
+    if (r < 0)
+      return r;
+
+    ++i;
+  }
+  r = remove_objects(path, moved, &objects);  // presumably unlinks from path the objects now linked into subdirs
+  if (r < 0)
+    return r;
+  info.objs = objects.size();  // NOTE(review): info is a by-value copy that is not read after this point
+  r = reset_attr(path);  // recount path's objects and subdirs from disk
+  if (r < 0)
+    return r;
+  r = fsync_dir(path);
+  if (r < 0)
+    return r;
+  return end_split_or_merge(path);  // clear the in-progress tag
+}
+
+void HashIndex::get_path_components(const ghobject_t &oid,
+				    vector<string> *path) {
+  char digits[MAX_HASH_LEVEL + 1];
+  snprintf(digits, sizeof(digits), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
+
+  // Append one single-character component per hex digit of the nibblewise
+  // key, i.e. the hex characters of oid.hobj.hash, least significant first
+  for (const char *d = digits; d != digits + MAX_HASH_LEVEL; ++d) {
+    path->push_back(string(d, 1));
+  }
+}
+
+/// Zero-padded upper-case hex of hash with the digit order reversed.
+string HashIndex::get_hash_str(uint32_t hash) {
+  char digits[MAX_HASH_LEVEL + 1];
+  snprintf(digits, sizeof(digits), "%.*X", MAX_HASH_LEVEL, hash);
+  string reversed;
+  for (int pos = MAX_HASH_LEVEL - 1; pos >= 0; --pos)
+    reversed.push_back(digits[pos]);
+  return reversed;
+}
+
+string HashIndex::get_path_str(const ghobject_t &oid) {
+  ceph_assert(!oid.is_max());  // callers must filter out the max object before asking for its path string
+  return get_hash_str(oid.hobj.get_hash());
+}
+
+/// Pad prefix with '0' to 8 hex digits, parse it, and reverse the nibble order.
+uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
+  if (prefix.size() < sizeof(uint32_t) * 2)
+    prefix.resize(sizeof(uint32_t) * 2, '0');
+  uint32_t hash = 0;  // stays 0 (rather than indeterminate) if sscanf matches nothing
+  sscanf(prefix.c_str(), "%x", &hash);
+  // nibble reverse
+  hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
+  hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
+  hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
+  return hash;
+}
+
+int HashIndex::get_path_contents_by_hash_bitwise(  // collect path's objects and the hash prefixes to visit, skipping what precedes *next_object
+  const vector<string> &path,
+  const ghobject_t *next_object,
+  set<string, CmpHexdigitStringBitwise> *hash_prefixes,
+  set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
+{
+  map<string, ghobject_t> rev_objects;
+  int r;
+  r = list_objects(path, 0, 0, &rev_objects);
+  if (r < 0)
+    return r;
+  // bitwise sort
+  for (map<string, ghobject_t>::iterator i = rev_objects.begin();
+       i != rev_objects.end();
+       ++i) {
+    if (next_object && i->second < *next_object)  // object lies before the resume point
+      continue;
+    string hash_prefix = get_path_str(i->second);
+    hash_prefixes->insert(hash_prefix);
+    objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
+  }
+  vector<string> subdirs;
+  r = list_subdirs(path, &subdirs);
+  if (r < 0)
+    return r;
+
+  // sort subdirs bitwise (by reversing hex digit nibbles)
+  std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
+
+  // Local to this function, we will convert the prefix strings
+  // (previously simply the reversed hex digits) to also have each
+  // digit's nibbles reversed.  This will make the strings sort
+  // bitwise.
+  string cur_prefix;
+  for (vector<string>::const_iterator i = path.begin();
+       i != path.end();
+       ++i) {
+    cur_prefix.append(reverse_hexdigit_bits_string(*i));
+  }
+  string next_object_string;
+  if (next_object)
+    next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
+  for (vector<string>::iterator i = subdirs.begin();
+       i != subdirs.end();
+       ++i) {
+    string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
+    if (next_object) {
+      if (next_object->is_max())  // resume point is max: no subdir is reported
+	continue;
+      if (candidate < next_object_string.substr(0, candidate.size()))  // subdir prefix sorts before the resume point's prefix
+	continue;
+    }
+    // re-reverse the hex digit nibbles for the caller
+    hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
+  }
+  return 0;
+}
+
+int HashIndex::list_by_hash(const vector<string> &path,
+			    const ghobject_t &end,
+			    int max_count,
+			    ghobject_t *next,
+			    vector<ghobject_t> *out)
+{
+  ceph_assert(out);  // out is mandatory
+  return list_by_hash_bitwise(path, end, max_count, next, out);  // straight delegation to the bitwise listing
+}
+
+int HashIndex::list_by_hash_bitwise(  // append objects under path to *out in bitwise order, resuming from *next
+  const vector<string> &path,
+  const ghobject_t& end,
+  int max_count,
+  ghobject_t *next,
+  vector<ghobject_t> *out)
+{
+  vector<string> next_path = path;
+  next_path.push_back("");  // placeholder for the subdir name, filled in per prefix below
+  set<string, CmpHexdigitStringBitwise> hash_prefixes;
+  set<pair<string, ghobject_t>, CmpPairBitwise> objects;
+  int r = get_path_contents_by_hash_bitwise(path,
+					    next,
+					    &hash_prefixes,
+					    &objects);
+  if (r < 0)
+    return r;
+  for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
+       i != hash_prefixes.end();
+       ++i) {
+    dout(20) << __func__ << " prefix " << *i << dendl;
+    set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
+      make_pair(*i, ghobject_t()));
+    if (j == objects.end() || j->first != *i) {  // no object carries this prefix, so treat it as a subdir and recurse
+      *(next_path.rbegin()) = *(i->rbegin());  // the last character of the prefix is the subdir name
+      ghobject_t next_recurse;
+      if (next)
+	next_recurse = *next;
+      r = list_by_hash_bitwise(next_path,
+			       end,
+			       max_count,
+			       &next_recurse,
+			       out);
+
+      if (r < 0)
+	return r;
+      if (!next_recurse.is_max()) {  // the recursion stopped early: hand its resume point to our caller
+	if (next)
+	  *next = next_recurse;
+	return 0;
+      }
+    } else {
+      while (j != objects.end() && j->first == *i) {
+	if (max_count > 0 && out->size() == (unsigned)max_count) {  // limit reached: resume from this object next time
+	  if (next)
+	    *next = j->second;
+	  return 0;
+	}
+	if (j->second >= end) {  // reached the end bound: stop here
+	  if (next)
+	    *next = j->second;
+	  return 0;
+	}
+	if (!next || j->second >= *next) {
+	  dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
+	  out->push_back(j->second);
+	}
+	++j;
+      }
+    }
+  }
+  if (next)
+    *next = ghobject_t::get_max();  // everything under path has been listed
+  return 0;
+}
+
+
diff --git a/src/os/filestore/HashIndex.h b/src/os/filestore/HashIndex.h
new file mode 100644
index 00000000..7e34d155
--- /dev/null
+++ b/src/os/filestore/HashIndex.h
@@ -0,0 +1,462 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_HASHINDEX_H
+#define CEPH_HASHINDEX_H
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "LFNIndex.h"
+
+extern string reverse_hexdigit_bits_string(string l);
+
+/**
+ * Implements collection prehashing.
+ *
+ * @verbatim
+ *     (root) - 0 - 0
+ *                - 1
+ *                - E
+ *            - 1
+ *            - 2 - D - 0
+ *            .
+ *            .
+ *            .
+ *            - F - 0
+ * @endverbatim
+ *
+ * A file is located at the longest existing directory from the root
+ * given by the hex characters in the hash beginning with the least
+ * significant.
+ *
+ * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
+ * would be located in (root)/2/D/0/
+ *
+ * Subdirectories are created when the number of objects in a
+ * directory exceeds 16 * (abs(merge_threshold) * split_multiplier +
+ * split_rand_factor). The number of objects in a directory is encoded
+ * as subdir_info_s in an xattr on the directory.
+ */
+class HashIndex : public LFNIndex {
+private:
+  /// Attribute name for storing subdir info @see subdir_info_s
+  static const string SUBDIR_ATTR;
+  /// Attribute name for storing index-wide settings
+  static const string SETTINGS_ATTR;
+  /// Attribute name for storing in progress op tag
+  static const string IN_PROGRESS_OP_TAG;
+  /// Size (bits) in object hash
+  static const int PATH_HASH_LEN = 32;
+  /// Max length of hashed path (one path level per 4-bit hex digit)
+  static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4);
+
+  /**
+   * Merges occur when the number of objects drops below
+   * merge_threshold and splits occur when the number of objects
+   * exceeds:
+   *
+   *   16 * (abs(merge_threshold) * split_multiplier + split_rand_factor)
+   *
+   * Please note that if merge_threshold is less than zero, merging
+   * never happens
+   */
+  int merge_threshold;
+  int split_multiplier;
+
+  /// Encodes current subdir state for determining when to split/merge.
+  struct subdir_info_s {
+    uint64_t objs;       ///< Objects in subdir.
+    uint32_t subdirs;    ///< Subdirs in subdir.
+    uint32_t hash_level; ///< Hashlevel of subdir.
+
+    subdir_info_s() : objs(0), subdirs(0), hash_level(0) {}
+
+    void encode(bufferlist &bl) const
+    {
+      using ceph::encode;
+      __u8 v = 1; // version byte; decode() asserts it is 1
+      encode(v, bl);
+      encode(objs, bl);
+      encode(subdirs, bl);
+      encode(hash_level, bl);
+    }
+
+    void decode(bufferlist::const_iterator &bl)
+    {
+      using ceph::decode;
+      __u8 v;
+      decode(v, bl);
+      ceph_assert(v == 1);
+      decode(objs, bl);
+      decode(subdirs, bl);
+      decode(hash_level, bl);
+    }
+  };
+  /// Settings blob; its only field is the split randomisation factor.
+  struct settings_s {
+    uint32_t split_rand_factor; ///< random factor added to split threshold (only on root of collection)
+    settings_s() : split_rand_factor(0) {}
+    void encode(bufferlist &bl) const
+    {
+      using ceph::encode;
+      __u8 v = 1;
+      encode(v, bl);
+      encode(split_rand_factor, bl);
+    }
+    void decode(bufferlist::const_iterator &bl)
+    {
+      using ceph::decode;
+      __u8 v;
+      decode(v, bl); // NOTE(review): version byte is read but not checked here, unlike subdir_info_s
+      decode(split_rand_factor, bl);
+    }
+  } settings;
+
+  /// Encodes in progress split or merge
+  struct InProgressOp {
+    static const int SPLIT = 0;
+    static const int MERGE = 1;
+    static const int COL_SPLIT = 2;
+    int op; ///< one of SPLIT, MERGE, COL_SPLIT
+    vector<string> path;
+
+    InProgressOp(int op, const vector<string> &path)
+      : op(op), path(path) {}
+
+    explicit InProgressOp(bufferlist::const_iterator &bl) {
+      decode(bl);
+    }
+
+    bool is_split() const { return op == SPLIT; }
+    bool is_col_split() const { return op == COL_SPLIT; }
+    bool is_merge() const { return op == MERGE; }
+
+    void encode(bufferlist &bl) const {
+      using ceph::encode;
+      __u8 v = 1; // version byte; decode() asserts it is 1
+      encode(v, bl);
+      encode(op, bl);
+      encode(path, bl);
+    }
+
+    void decode(bufferlist::const_iterator &bl) {
+      using ceph::decode;
+      __u8 v;
+      decode(v, bl);
+      ceph_assert(v == 1);
+      decode(op, bl);
+      decode(path, bl);
+    }
+  };
+
+
+public:
+  /// Constructor.
+  HashIndex(
+    CephContext* cct,
+    coll_t collection,     ///< [in] Collection
+    const char *base_path, ///< [in] Path to the index root.
+    int merge_at,          ///< [in] Merge threshold.
+    int split_multiple,	   ///< [in] Split multiplier.
+    uint32_t index_version,///< [in] Index version
+    double retry_probability=0) ///< [in] retry probability
+    : LFNIndex(cct, collection, base_path, index_version, retry_probability),
+      merge_threshold(merge_at),
+      split_multiplier(split_multiple)
+  {}
+
+  int read_settings() override;
+
+  /// @see CollectionIndex
+  uint32_t collection_version() override { return index_version; }
+
+  /// @see CollectionIndex
+  int cleanup() override;
+
+  /// @see CollectionIndex
+  int prep_delete() override;
+
+  /// @see CollectionIndex
+  int _split(
+    uint32_t match,
+    uint32_t bits,
+    CollectionIndex* dest
+    ) override;
+
+  /// @see CollectionIndex
+  int _merge(
+    uint32_t bits,
+    CollectionIndex* dest
+    ) override;
+
+  int _merge_dirs(
+    HashIndex& from,
+    HashIndex& to,
+    const vector<string>& path);
+
+  /// @see CollectionIndex
+  int apply_layout_settings(int target_level) override;
+
+protected:
+  int _init() override;
+
+  int _created(
+    const vector<string> &path,
+    const ghobject_t &oid,
+    const string &mangled_name
+    ) override;
+  int _remove(
+    const vector<string> &path,
+    const ghobject_t &oid,
+    const string &mangled_name
+    ) override;
+  int _lookup(
+    const ghobject_t &oid,
+    vector<string> *path,
+    string *mangled_name,
+    int *hardlink
+    ) override;
+
+  /**
+   * Pre-hash the collection to create folders according to the expected number
+   * of objects in this collection.
+   */
+  int _pre_hash_collection(
+      uint32_t pg_num,
+      uint64_t expected_num_objs
+      ) override;
+
+  int _collection_list_partial(
+    const ghobject_t &start,
+    const ghobject_t &end,
+    int max_count,
+    vector<ghobject_t> *ls,
+    ghobject_t *next
+    ) override;
+private:
+  /// Internal recursively remove path and its subdirs
+  int _recursive_remove(
+    const vector<string> &path, ///< [in] path to remove
+    bool top			///< [in] internal tracking of first caller
+    ); ///< @return Error Code, 0 on success
+  /// Recursively remove path and its subdirs
+  int recursive_remove(
+    const vector<string> &path ///< [in] path to remove
+    ); ///< @return Error Code, 0 on success
+  /// Tag root directory at beginning of col_split
+  int start_col_split(
+    const vector<string> &path ///< [in] path to split
+    ); ///< @return Error Code, 0 on success
+  /// Tag root directory at beginning of split
+  int start_split(
+    const vector<string> &path ///< [in] path to split
+    ); ///< @return Error Code, 0 on success
+  /// Tag root directory at beginning of merge
+  int start_merge(
+    const vector<string> &path ///< [in] path to merge
+    ); ///< @return Error Code, 0 on success
+  /// Remove tag at end of split or merge
+  int end_split_or_merge(
+    const vector<string> &path ///< [in] path that was split or merged
+    ); ///< @return Error Code, 0 on success
+  /// Gets info from the xattr on the subdir represented by path
+  int get_info(
+    const vector<string> &path, ///< [in] Path from which to read attribute.
+    subdir_info_s *info		///< [out] Attribute value
+    ); ///< @return Error Code, 0 on success
+
+  /// Sets info to the xattr on the subdir represented by path
+  int set_info(
+    const vector<string> &path, ///< [in] Path on which to set attribute.
+    const subdir_info_s &info  	///< [in] Value to set
+    ); ///< @return Error Code, 0 on success
+
+  /// Encapsulates logic for when to merge.
+  bool must_merge(
+    const subdir_info_s &info ///< [in] Info to check
+    ); ///< @return True if info must be merged, False otherwise
+
+  /// Encapsulates logic for when to split.
+  bool must_split(
+    const subdir_info_s &info, ///< [in] Info to check
+    int target_level = 0
+    ); ///< @return True if info must be split, False otherwise
+
+  /// Initiates merge
+  int initiate_merge(
+    const vector<string> &path, ///< [in] Subdir to merge
+    subdir_info_s info		///< [in] Info attached to path
+    ); ///< @return Error Code, 0 on success
+
+  /// Completes merge
+  int complete_merge(
+    const vector<string> &path, ///< [in] Subdir to merge
+    subdir_info_s info		///< [in] Info attached to path
+    ); ///< @return Error Code, 0 on success
+
+  /// Resets attr to match actual subdir contents
+  int reset_attr(
+    const vector<string> &path ///< [in] path to cleanup
+    );
+
+  /// Initiate Split
+  int initiate_split(
+    const vector<string> &path, ///< [in] Subdir to split
+    subdir_info_s info		///< [in] Info attached to path
+    ); ///< @return Error Code, 0 on success
+
+  /// Completes Split
+  int complete_split(
+    const vector<string> &path, ///< [in] Subdir to split
+    subdir_info_s info	       ///< [in] Info attached to path
+    ); ///< @return Error Code, 0 on success
+
+  /// Determine path components from hoid hash
+  void get_path_components(
+    const ghobject_t &oid, ///< [in] Object for which to get path components
+    vector<string> *path   ///< [out] Path components for hoid.
+    );
+
+  /// Pre-hash and split folders to avoid runtime splitting
+  /// according to the given expected object number.
+  int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs);
+
+  /// Initialize the folder (dir info) with the given hash
+  /// level and number of its subdirs.
+  int init_split_folder(vector<string> &path, uint32_t hash_level);
+
+  /// do collection split for path
+  static int col_split_level(
+    HashIndex &from,            ///< [in] from index
+    HashIndex &dest,            ///< [in] to index
+    const vector<string> &path, ///< [in] path to split
+    uint32_t bits,              ///< [in] num bits to match
+    uint32_t match,             ///< [in] bits to match
+    unsigned *mkdirred          ///< [in,out] path[:mkdirred] has been mkdirred
+    );
+
+
+  /**
+   * Get string representation of ghobject_t/hash
+   *
+   * e.g: 0x01234567 -> "76543210"
+   */
+  static string get_path_str(
+    const ghobject_t &oid ///< [in] Object to get hash string for
+    ); ///< @return Hash string for hoid.
+
+  /// Get string from hash, @see get_path_str
+  static string get_hash_str(
+    uint32_t hash ///< [in] Hash to convert to a string.
+    ); ///< @return String representation of hash
+
+  /// Get hash from hash prefix string e.g. "FFFFAB" -> 0xFFFFAB00
+  static uint32_t hash_prefix_to_hash(
+    string prefix ///< [in] string to convert
+    ); ///< @return Hash
+
+  /// Get hash mod from path; either output pointer may be null to skip that result
+  static void path_to_hobject_hash_prefix(
+    const vector<string> &path,///< [in] path to convert
+    uint32_t *bits,            ///< [out] bits
+    uint32_t *hash             ///< [out] hash
+    ) {
+    string hash_str;
+    for (vector<string>::const_iterator i = path.begin();
+	 i != path.end();
+	 ++i) {
+      hash_str.push_back(*i->begin()); // first character of each path component
+    }
+    uint32_t rev_hash = hash_prefix_to_hash(hash_str);
+    if (hash)
+      *hash = rev_hash;
+    if (bits)
+      *bits = path.size() * 4; // 4 bits per path component
+  }
+
+  /// Calculate the number of bits needed to represent n (0 when n is 0).
+  static int calc_num_bits(uint64_t n) {
+    int ret = 0;
+    while (n > 0) {
+      n = n >> 1;
+      ret++;
+    }
+    return ret;
+  }
+
+  /// Convert a number in [0, 16) to a one-character hex string (upper case).
+  static string to_hex(int n) {
+    ceph_assert(n >= 0 && n < 16);
+    char c = (n <= 9 ? ('0' + n) : ('A' + n - 10));
+    string str;
+    str.append(1, c);
+    return str;
+  }
+
+  /// Orders pairs by their string first, then by cmp() on the object.
+  struct CmpPairBitwise {
+    bool operator()(const pair<string, ghobject_t>& l,
+		    const pair<string, ghobject_t>& r) const
+    {
+      if (l.first < r.first)
+	return true;
+      if (l.first > r.first)
+	return false;
+      if (cmp(l.second, r.second) < 0)
+	return true;
+      return false;
+    }
+  };
+
+  /// Orders strings by the result of reverse_hexdigit_bits_string() on each.
+  struct CmpHexdigitStringBitwise {
+    bool operator()(const string& l, const string& r) const {
+      return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r);
+    }
+  };
+
+  /// Get path contents by hash
+  int get_path_contents_by_hash_bitwise(
+    const vector<string> &path,             ///< [in] Path to list
+    const ghobject_t *next_object,          ///< [in] list > *next_object
+    set<string, CmpHexdigitStringBitwise> *hash_prefixes, ///< [out] prefixes in dir
+    set<pair<string, ghobject_t>, CmpPairBitwise> *objects ///< [out] objects
+    );
+
+  /// List objects in collection in ghobject_t order
+  int list_by_hash(
+    const vector<string> &path, ///< [in] Path to list
+    const ghobject_t &end,      ///< [in] List only objects < end
+    int max_count,              ///< [in] List at most max_count
+    ghobject_t *next,            ///< [in,out] List objects >= *next
+    vector<ghobject_t> *out      ///< [out] Listed objects
+    ); ///< @return Error Code, 0 on success
+  /// List objects in collection in ghobject_t order
+  int list_by_hash_bitwise(
+    const vector<string> &path, ///< [in] Path to list
+    const ghobject_t &end,      ///< [in] List only objects < end
+    int max_count,              ///< [in] List at most max_count
+    ghobject_t *next,            ///< [in,out] List objects >= *next
+    vector<ghobject_t> *out      ///< [out] Listed objects
+    ); ///< @return Error Code, 0 on success
+
+  /// Create the given levels of sub directories from the given root.
+  /// The contents of *path* are not changed after calling this function.
+  int recursive_create_path(vector<string>& path, int level);
+
+  /// split each dir below the given path
+  int split_dirs(const vector<string> &path, int target_level = 0);
+
+  int write_settings();
+};
+
+#endif
diff --git a/src/os/filestore/IndexManager.cc b/src/os/filestore/IndexManager.cc
new file mode 100644
index 00000000..73095026
--- /dev/null
+++ b/src/os/filestore/IndexManager.cc
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/unordered_map.h"
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include <errno.h>
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+
+#include "IndexManager.h"
+#include "HashIndex.h"
+#include "CollectionIndex.h"
+
+#include "chain_xattr.h"
+
+/// Encode version and store it in the "user.cephos.collection_version" xattr of path.
+static int set_version(const char *path, uint32_t version) {
+  bufferlist encoded;
+  encode(version, encoded);
+  return chain_setxattr<true, true>(path, "user.cephos.collection_version",
+				    encoded.c_str(), encoded.length());
+}
+
+/// Read the collection version xattr of path into *version; returns 0 or -ENOENT.
+static int get_version(const char *path, uint32_t *version) {
+  bufferptr raw(PATH_MAX);
+  const int len = chain_getxattr(path, "user.cephos.collection_version",
+				 raw.c_str(), raw.length());
+  if (len == -ENOENT)
+    return len;
+  if (len < 0) {
+    // Any failure other than -ENOENT is reported as version 0, not as an error.
+    *version = 0;
+    return 0;
+  }
+  raw.set_length(len);
+  bufferlist bl;
+  bl.push_back(raw);
+  auto p = bl.cbegin();
+  decode(*version, p);
+  return 0;
+}
+
+/// Deletes every CollectionIndex held in col_indices, then empties the map.
+IndexManager::~IndexManager() {
+  for (auto &entry : col_indices) {
+    delete entry.second;
+    entry.second = NULL;
+  }
+  col_indices.clear();
+}
+
+
+/// Writes the version xattr on path, then initialises a temporary HashIndex
+/// there and returns the result of its read_settings() (or the first error).
+int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
+  RWLock::WLocker l(lock);
+  int r = set_version(path, version);
+  if (r < 0)
+    return r;
+  HashIndex index(cct, c, path,
+		  cct->_conf->filestore_merge_threshold,
+		  cct->_conf->filestore_split_multiple,
+		  version,
+		  cct->_conf->filestore_index_retry_probability);
+  r = index.init();
+  return r < 0 ? r : index.read_settings();
+}
+
+/// Allocates a HashIndex for c at path into *index and returns its
+/// read_settings() result; when upgrading, the stored version is read first.
+int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) {
+  if (!upgrade) {
+    // No need to check the stored version.
+    *index = new HashIndex(cct, c, path, cct->_conf->filestore_merge_threshold,
+			   cct->_conf->filestore_split_multiple,
+			   CollectionIndex::HOBJECT_WITH_POOL,
+			   cct->_conf->filestore_index_retry_probability);
+    return (*index)->read_settings();
+  }
+
+  // Need to check the collection generation
+  uint32_t version = 0;
+  const int r = get_version(path, &version);
+  if (r < 0)
+    return r;
+
+  switch (version) {
+  case CollectionIndex::FLAT_INDEX_TAG:
+  case CollectionIndex::HASH_INDEX_TAG: // fall through
+  case CollectionIndex::HASH_INDEX_TAG_2: // fall through
+  case CollectionIndex::HOBJECT_WITH_POOL:
+    // Must be a HashIndex; this path does not pass a retry probability.
+    *index = new HashIndex(cct, c, path,
+			   cct->_conf->filestore_merge_threshold,
+			   cct->_conf->filestore_split_multiple,
+			   version);
+    return (*index)->read_settings();
+  default:
+    ceph_abort();
+  }
+}
+
+/// Looks c up in col_indices under the read lock; on a hit stores the
+/// pointer in *index and returns true, otherwise returns false.
+bool IndexManager::get_index_optimistic(coll_t c, Index *index) {
+  RWLock::RLocker l(lock);
+  auto found = col_indices.find(c);
+  if (found != col_indices.end()) {
+    index->index = found->second;
+    return true;
+  }
+  return false;
+}
+
+/**
+ * Return the cached index for collection c, building and caching it on a miss.
+ *
+ * @param c [in] collection to look up
+ * @param baseDir [in] base directory; the collection path is <baseDir>/current/<c>
+ * @param index [out] receives the index pointer on success
+ * @return 0 on success, negative error code from build_index() otherwise
+ */
+int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) {
+  // Fast path: get_index_optimistic() only takes the read lock.
+  if (get_index_optimistic(c, index))
+    return 0;
+
+  RWLock::WLocker l(lock);
+  // Look again now that the write lock is held; the read lock was released in between.
+  ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c);
+  if (it != col_indices.end()) {
+    index->index = it->second;
+    return 0;
+  }
+
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s/current/%s", baseDir.c_str(), c.to_str().c_str());
+  CollectionIndex* colIndex = NULL;
+  int r = build_index(c, path, &colIndex);
+  if (r < 0) {
+    // build_index() allocates the index before calling read_settings(), so a
+    // failure there still hands back an object; free it instead of leaking it.
+    delete colIndex;
+    return r;
+  }
+  col_indices[c] = colIndex;
+  index->index = colIndex;
+  return 0;
+}
diff --git a/src/os/filestore/IndexManager.h b/src/os/filestore/IndexManager.h
new file mode 100644
index 00000000..19cd2926
--- /dev/null
+++ b/src/os/filestore/IndexManager.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef OS_INDEXMANAGER_H
+#define OS_INDEXMANAGER_H
+
+#include "include/unordered_map.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+
+#include "CollectionIndex.h"
+#include "HashIndex.h"
+
+
+/// Public type for Index
+struct Index {
+  CollectionIndex *index; ///< wrapped pointer; Index declares no destructor and never frees it
+
+  /// Starts out wrapping no index.
+  Index() : index(NULL) {}
+  /// Wraps an existing CollectionIndex.
+  explicit Index(CollectionIndex* idx) : index(idx) {}
+
+  /// Pointer-style access to the wrapped index (no null check is made).
+  CollectionIndex *operator->() { return index; }
+  CollectionIndex &operator*() { return *index; }
+};
+
+
+/**
+ * Encapsulates mutual exclusion for CollectionIndexes.
+ *
+ * Allowing a modification (removal or addition of an object) to occur
+ * while a read is occurring (lookup of an object's path and use of
+ * that path) may result in the path becoming invalid.  Thus, during
+ * the lifetime of a CollectionIndex object and any paths returned
+ * by it, no other concurrent accesses may be allowed.
+ * This is enforced by using CollectionIndex::access_lock
+ */
+class IndexManager {
+  CephContext* cct;
+  RWLock lock;  ///< Lock for Index Manager
+  bool upgrade; ///< passed in at construction and stored as-is
+  ceph::unordered_map<coll_t, CollectionIndex* > col_indices; ///< indices keyed by collection
+
+  /**
+   * Index factory
+   *
+   * Encapsulates logic for handling legacy FileStore
+   * layouts
+   *
+   * @param [in] c Collection for which to get index
+   * @param [in] path Path to collection
+   * @param [out] index Index for c
+   * @return error code
+   */
+  int build_index(coll_t c, const char *path, CollectionIndex **index);
+
+  /// Lookup of an already-built index for c; declared to return bool.
+  bool get_index_optimistic(coll_t c, Index *index);
+
+public:
+  /// Constructor: stores cct and upgrade, names the lock "IndexManager lock".
+  explicit IndexManager(CephContext* cct, bool upgrade)
+    : cct(cct),
+      lock("IndexManager lock"),
+      upgrade(upgrade)
+  {}
+
+  ~IndexManager();
+
+  /**
+   * Reserve and return index for c
+   *
+   * @param [in] c Collection for which to get index
+   * @param [in] baseDir base directory of collections
+   * @param [out] index Index for c
+   * @return error code
+   */
+  int get_index(coll_t c, const string& baseDir, Index *index);
+
+  /**
+   * Initialize index for collection c at path
+   *
+   * @param [in] c Collection for which to init Index
+   * @param [in] path Path to collection
+   * @param [in] filestore_version version of containing FileStore
+   * @return error code
+   */
+  int init_index(coll_t c, const char *path, uint32_t filestore_version);
+};
+
+#endif
diff --git a/src/os/filestore/Journal.h b/src/os/filestore/Journal.h
new file mode 100644
index 00000000..cfb667d8
--- /dev/null
+++ b/src/os/filestore/Journal.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_JOURNAL_H
+#define CEPH_JOURNAL_H
+
+#include <errno.h>
+
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "common/Finisher.h"
+#include "common/TrackedOp.h"
+#include "os/ObjectStore.h"
+#include "common/zipkin_trace.h"
+
+class PerfCounters;
+
+class Journal {
+protected:
+  uuid_d fsid;
+  Finisher *finisher;
+
+public:
+  CephContext* cct;
+  PerfCounters *logger;  ///< NULL after construction
+
+protected:
+  Cond *do_sync_cond;
+  bool wait_on_full;     ///< false after construction; changed via set_wait_on_full()
+
+public:
+  /// Stores the context, fsid, finisher and the optional sync condition.
+  Journal(CephContext* cct, uuid_d f, Finisher *fin, Cond *c = 0)
+    : fsid(f),
+      finisher(fin),
+      cct(cct),
+      logger(NULL),
+      do_sync_cond(c),
+      wait_on_full(false)
+  {}
+  virtual ~Journal() {}
+
+  virtual int check() = 0;                  ///< check if journal appears valid
+  virtual int create() = 0;                 ///< create a fresh journal
+  virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal
+  virtual void close() = 0;                 ///< close an open journal
+
+  virtual void flush() = 0;
+
+  /// The base-class versions of these two hooks do nothing.
+  virtual void get_devices(set<string> *ls) {}
+  virtual void collect_metadata(map<string,string> *pm) {}
+
+  /**
+   * reserve_throttle_and_backoff
+   *
+   * Implementation may throttle or backoff based on ops
+   * reserved here but not yet released using committed_thru.
+   */
+  virtual void reserve_throttle_and_backoff(uint64_t count) = 0;
+
+  /// The base-class version returns -EOPNOTSUPP.
+  virtual int dump(ostream& out) { return -EOPNOTSUPP; }
+
+  void set_wait_on_full(bool b) { wait_on_full = b; }
+
+  // writes
+  virtual bool is_writeable() = 0;
+  virtual int make_writeable() = 0;
+  virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
+			    Context *oncommit,
+			    TrackedOpRef osd_op = TrackedOpRef()) = 0;
+  virtual void commit_start(uint64_t seq) = 0;
+  virtual void committed_thru(uint64_t seq) = 0;
+
+  /// Read next journal entry - asserts on invalid journal
+  virtual bool read_entry(
+    bufferlist &bl, ///< [out] payload on successful read
+    uint64_t &seq   ///< [in,out] sequence number on last successful read
+    ) = 0; ///< @return true on successful read, false on journal end
+
+  virtual bool should_commit_now() = 0;
+
+  virtual int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) = 0;
+
+  /// The base-class version returns 0.
+  virtual off64_t get_journal_size_estimate() { return 0; }
+
+  // reads/recovery
+
+};
+
+#endif
diff --git a/src/os/filestore/JournalThrottle.cc b/src/os/filestore/JournalThrottle.cc
new file mode 100644
index 00000000..8475bbbf
--- /dev/null
+++ b/src/os/filestore/JournalThrottle.cc
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "JournalThrottle.h"
+#include "include/ceph_assert.h"
+
+/// Passes every tuning parameter through unchanged to throttle.set_params()
+/// and returns its result.
+bool JournalThrottle::set_params(
+  double low_threshhold,
+  double high_threshhold,
+  double expected_throughput,
+  double high_multiple,
+  double max_multiple,
+  uint64_t throttle_max,
+  std::ostream *errstream)
+{
+  const bool accepted = throttle.set_params(
+    low_threshhold, high_threshhold, expected_throughput,
+    high_multiple, max_multiple, throttle_max, errstream);
+  return accepted;
+}
+
+/// Forwards to throttle.get(c) and returns the duration it reports.
+std::chrono::duration<double> JournalThrottle::get(uint64_t c)
+{
+  std::chrono::duration<double> waited = throttle.get(c);
+  return waited;
+}
+
+/// Forwards to throttle.take(c) and returns its result.
+uint64_t JournalThrottle::take(uint64_t c)
+{
+  uint64_t taken = throttle.take(c);
+  return taken;
+}
+
+/// Appends the (seq, c) pair to journaled_ops while holding the mutex.
+void JournalThrottle::register_throttle_seq(uint64_t seq, uint64_t c)
+{
+  locker guard(lock);
+  journaled_ops.emplace_back(seq, c);
+}
+
+/**
+ * Release the throttle held by every registered entry whose seq is <= mono_id.
+ *
+ * Entries are popped from the front of journaled_ops until the queue is
+ * empty or the front entry has a larger seq.
+ *
+ * @param mono_id [in] highest sequence number to release
+ * @return pair<number of entries released, sum of their amounts>
+ */
+std::pair<uint64_t, uint64_t> JournalThrottle::flush(uint64_t mono_id)
+{
+  uint64_t to_put_bytes = 0;
+  uint64_t to_put_ops = 0;
+  {
+    // Only the queue needs the mutex; throttle.put() runs after it is released.
+    locker l(lock);
+    while (!journaled_ops.empty() &&
+	   journaled_ops.front().first <= mono_id) {
+      to_put_bytes += journaled_ops.front().second;
+      to_put_ops++;
+      journaled_ops.pop_front();
+    }
+  }
+  throttle.put(to_put_bytes);
+  // Qualified on purpose: make_pair on plain integers is not found by ADL, so
+  // the unqualified call only compiled thanks to a using-directive elsewhere.
+  return std::make_pair(to_put_ops, to_put_bytes);
+}
+
+/// Forwards to throttle.get_current().
+uint64_t JournalThrottle::get_current()
+{
+  uint64_t current = throttle.get_current();
+  return current;
+}
+
+/// Forwards to throttle.get_max().
+uint64_t JournalThrottle::get_max()
+{
+  uint64_t maximum = throttle.get_max();
+  return maximum;
+}
diff --git a/src/os/filestore/JournalThrottle.h b/src/os/filestore/JournalThrottle.h
new file mode 100644
index 00000000..75485d6d
--- /dev/null
+++ b/src/os/filestore/JournalThrottle.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_THROTTLE_H
+#define CEPH_JOURNAL_THROTTLE_H
+
+#include "common/Throttle.h"
+
+#include <list>
+#include <deque>
+#include <condition_variable>
+#include <thread>
+#include <vector>
+#include <chrono>
+#include <iostream>
+
+/**
+ * JournalThrottle
+ *
+ * Throttle designed to implement dynamic throttling as the journal fills
+ * up.  The goal is to not delay ops at all when the journal is relatively
+ * empty, delay ops somewhat as the journal begins to fill (with the delay
+ * getting linearly longer as the journal fills up to a high water mark),
+ * and to delay much more aggressively (though still linearly with usage)
+ * until we hit the max value.
+ *
+ * The implementation simply wraps BackoffThrottle with a queue of
+ * journaled but not synced ops.
+ *
+ * The usage pattern is as follows:
+ * 1) Call get(bytes) before taking the op_queue_throttle, and tie the bytes to the op with register_throttle_seq(seq, bytes)
+ * 2) Once the journal is flushed, flush(max_op_id_flushed)
+ */
+class JournalThrottle {
+  using locker = std::unique_lock<std::mutex>;
+
+  BackoffThrottle throttle; ///< wrapped throttle, named "filestore_journal"
+
+  std::mutex lock;
+  /// deque<id, count> of amounts registered through register_throttle_seq()
+  std::deque<std::pair<uint64_t, uint64_t> > journaled_ops;
+
+public:
+  JournalThrottle(
+    unsigned expected_concurrency ///< [in] determines size of conds
+    ) : throttle(g_ceph_context, "filestore_journal", expected_concurrency) {}
+
+  /**
+   * set_params
+   *
+   * Sets params.  If the params are invalid, returns false
+   * and populates errstream (if non-null) with a user comprehensible
+   * explanation.
+   */
+  bool set_params(
+    double low_threshhold,
+    double high_threshhold,
+    double expected_throughput,
+    double high_multiple,
+    double max_multiple,
+    uint64_t throttle_max,
+    std::ostream *errstream);
+
+  /**
+   * get
+   *
+   * Gets the specified amount of throttle, waiting as necessary
+   *
+   * @param c [in] amount to take
+   * @return duration waited
+   */
+  std::chrono::duration<double> get(uint64_t c);
+
+  /**
+   * take
+   *
+   * Takes specified throttle without waiting
+   *
+   * @param c [in] amount to take
+   */
+  uint64_t take(uint64_t c);
+
+  /**
+   * register_throttle_seq
+   *
+   * Registers a sequence number with an amount of throttle to
+   * release upon flush()
+   *
+   * @param seq [in] seq
+   * @param c [in] amount of throttle held for seq
+   */
+  void register_throttle_seq(uint64_t seq, uint64_t c);
+
+  /**
+   * flush
+   *
+   * Releases throttle held by ids <= mono_id
+   *
+   * @param mono_id [in] id up to which to flush
+   * @returns pair<ops_flushed, bytes_flushed>
+   */
+  std::pair<uint64_t, uint64_t> flush(uint64_t mono_id);
+
+  uint64_t get_current();
+  uint64_t get_max();
+};
+
+#endif
diff --git a/src/os/filestore/JournalingObjectStore.cc b/src/os/filestore/JournalingObjectStore.cc
new file mode 100644
index 00000000..714d0935
--- /dev/null
+++ b/src/os/filestore/JournalingObjectStore.cc
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "JournalingObjectStore.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+
+
+// Logs the call and starts the finisher member.
+void JournalingObjectStore::journal_start()
+{
+  dout(10) << "journal_start" << dendl;
+  finisher.start();
+}
+
+// Waits for the finisher's queue to empty, then stops it.
+void JournalingObjectStore::journal_stop()
+{
+  dout(10) << "journal_stop" << dendl;
+  finisher.wait_for_empty();
+  finisher.stop();
+}
+
+// journal_replay() leaves the journal writeable; this undoes that by
+// closing and freeing the journal (when one exists) and then resetting
+// the apply manager's sequence state.
+void JournalingObjectStore::journal_write_close()
+{
+  if (journal != nullptr) {
+    journal->close();
+    delete journal;
+    journal = nullptr;
+  }
+  apply_manager.reset();
+}
+
+/**
+ * Replay every journal entry newer than fs_op_seq through do_transactions().
+ *
+ * @param fs_op_seq last op sequence the backing store reports as applied;
+ *                  overridden when the journal_replay_from option is set
+ * @return number of entries replayed, 0 when there is no journal, or a
+ *         negative error from journal->open() / journal->make_writeable()
+ */
+int JournalingObjectStore::journal_replay(uint64_t fs_op_seq)
+{
+  dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl;
+
+  if (cct->_conf->journal_replay_from) {
+    dout(0) << "journal_replay forcing replay from "
+	    << cct->_conf->journal_replay_from
+	    << " instead of " << fs_op_seq << dendl;
+    // the previous op is the last one committed
+    fs_op_seq = cct->_conf->journal_replay_from - 1;
+  }
+
+  apply_manager.init_seq(fs_op_seq);
+  uint64_t last_seq = fs_op_seq;
+
+  if (journal == nullptr) {
+    // nothing to replay; just seed the submit sequence
+    submit_manager.set_op_seq(last_seq);
+    return 0;
+  }
+
+  int err = journal->open(last_seq);
+  if (err < 0) {
+    dout(3) << "journal_replay open failed with "
+	    << cpp_strerror(err) << dendl;
+    delete journal;
+    journal = nullptr;
+    return err;
+  }
+
+  replaying = true;
+
+  int replayed = 0;
+  for (;;) {
+    bufferlist entry;
+    uint64_t seq = last_seq + 1;
+    if (!journal->read_entry(entry, seq)) {
+      dout(3) << "journal_replay: end of journal, done." << dendl;
+      break;
+    }
+
+    if (seq <= last_seq) {
+      dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << last_seq << dendl;
+      continue;
+    }
+    // entries must arrive strictly in sequence
+    ceph_assert(last_seq == seq-1);
+
+    dout(3) << "journal_replay: applying op seq " << seq << dendl;
+    // decode every transaction packed into this journal entry
+    vector<ObjectStore::Transaction> tls;
+    for (auto p = entry.cbegin(); !p.end(); ) {
+      tls.emplace_back(Transaction(p));
+    }
+
+    apply_manager.op_apply_start(seq);
+    int r = do_transactions(tls, seq);
+    apply_manager.op_apply_finish(seq);
+
+    last_seq = seq;
+    ++replayed;
+
+    dout(3) << "journal_replay: r = " << r << ", op_seq now " << last_seq << dendl;
+  }
+
+  if (replayed != 0) {
+    dout(3) << "journal_replay: total = " << replayed << dendl;
+  }
+
+  replaying = false;
+
+  submit_manager.set_op_seq(last_seq);
+
+  // done reading, make writeable.
+  err = journal->make_writeable();
+  if (err < 0) {
+    return err;
+  }
+
+  if (replayed == 0) {
+    journal->committed_thru(fs_op_seq);
+  }
+
+  return replayed;
+}
+
+
+// ------------------------------------
+
+// Registers the start of applying op: waits under apply_lock while
+// 'blocked' is set, then bumps open_ops.  Returns op unchanged.
+uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op)
+{
+  Mutex::Locker guard(apply_lock);
+  for (; blocked; blocked_cond.Wait(apply_lock)) {
+    dout(10) << "op_apply_start blocked, waiting" << dendl;
+  }
+  dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> "
+	   << (open_ops+1) << dendl;
+  ceph_assert(!blocked);
+  // an op being applied must be newer than anything already committed
+  ceph_assert(op > committed_seq);
+  ++open_ops;
+  return op;
+}
+
+// Registers the end of applying op: drops open_ops, signals blocked_cond
+// if 'blocked' is set, and raises max_applied_seq to op when op is larger.
+void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op)
+{
+  Mutex::Locker guard(apply_lock);
+  dout(10) << "op_apply_finish " << op << " open_ops " << open_ops << " -> "
+	   << (open_ops-1) << ", max_applied_seq " << max_applied_seq << " -> "
+	   << std::max(op, max_applied_seq) << dendl;
+  open_ops -= 1;
+  ceph_assert(open_ops >= 0);
+
+  // signal a blocked commit_start
+  if (blocked) {
+    blocked_cond.Signal();
+  }
+
+  // there can be multiple applies in flight; track the max value we
+  // note.  note that we can't _read_ this value and learn anything
+  // meaningful unless/until we've quiesced all in-flight applies.
+  max_applied_seq = std::max(op, max_applied_seq);
+}
+
+// Allocates the next op sequence number.  Takes 'lock' and returns with it
+// still held; op_submit_finish() is the function that unlocks it.
+uint64_t JournalingObjectStore::SubmitManager::op_submit_start()
+{
+  lock.Lock();
+  const uint64_t assigned = ++op_seq;
+  dout(10) << "op_submit_start " << assigned << dendl;
+  return assigned;
+}
+
+// Records op as submitted and unlocks 'lock' (locked by op_submit_start()).
+// Aborts if op is not exactly one past the previously submitted op.
+void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op)
+{
+  dout(10) << "op_submit_finish " << op << dendl;
+  const bool in_order = (op == op_submitted + 1);
+  if (!in_order) {
+    dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
+	    << ", OUT OF ORDER" << dendl;
+    ceph_abort_msg("out of order op_submit_finish");
+  }
+  op_submitted = op;
+  lock.Unlock();
+}
+
+
+// ------------------------------------------
+
+// Queues c under sequence op in commit_waiters (guarded by com_lock);
+// commit_finish() hands queued contexts to the finisher.  c must be non-null.
+void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c)
+{
+  Mutex::Locker l(com_lock);
+  ceph_assert(c);
+  commit_waiters[op].push_back(c);
+}
+
+/**
+ * Begin a commit: set 'blocked', wait for open_ops to reach zero, then pick
+ * max_applied_seq as committing_seq.
+ *
+ * @return false (and clears 'blocked') when max_applied_seq already equals
+ *         committed_seq; true otherwise, in which case 'blocked' stays set
+ *         and the journal, if any, is told via commit_start().
+ */
+bool JournalingObjectStore::ApplyManager::commit_start()
+{
+  {
+    Mutex::Locker apply_guard(apply_lock);
+    dout(10) << "commit_start max_applied_seq " << max_applied_seq
+	     << ", open_ops " << open_ops << dendl;
+    blocked = true;
+    while (open_ops > 0) {
+      dout(10) << "commit_start waiting for " << open_ops
+	       << " open ops to drain" << dendl;
+      blocked_cond.Wait(apply_lock);
+    }
+    ceph_assert(open_ops == 0);
+    dout(10) << "commit_start blocked, all open_ops have completed" << dendl;
+
+    // com_lock nests inside apply_lock and is released first on scope exit
+    Mutex::Locker commit_guard(com_lock);
+    if (max_applied_seq == committed_seq) {
+      dout(10) << "commit_start nothing to do" << dendl;
+      blocked = false;
+      ceph_assert(commit_waiters.empty());
+      return false;
+    }
+
+    committing_seq = max_applied_seq;
+
+    dout(10) << "commit_start committing " << committing_seq
+	     << ", still blocked" << dendl;
+  }
+
+  if (journal) {
+    journal->commit_start(committing_seq);  // tell the journal too
+  }
+  return true;
+}
+
+// Clears the 'blocked' flag set by commit_start() and signals blocked_cond,
+// all under apply_lock.
+void JournalingObjectStore::ApplyManager::commit_started()
+{
+  Mutex::Locker l(apply_lock);
+  // allow new ops. (underlying fs should now be committing all prior ops)
+  dout(10) << "commit_started committing " << committing_seq << ", unblocking"
+	   << dendl;
+  blocked = false;
+  blocked_cond.Signal();
+}
+
+// Completes the commit in progress: tells the journal (if any) it is
+// committed through committing_seq, records that as committed_seq, and
+// passes every waiter registered at or below that sequence to the finisher.
+void JournalingObjectStore::ApplyManager::commit_finish()
+{
+  Mutex::Locker guard(com_lock);
+  dout(10) << "commit_finish thru " << committing_seq << dendl;
+
+  if (journal) {
+    journal->committed_thru(committing_seq);
+  }
+
+  committed_seq = committing_seq;
+
+  // commit_waiters is ordered by sequence, so stop at the first newer key
+  auto waiter = commit_waiters.begin();
+  while (waiter != commit_waiters.end() && waiter->first <= committing_seq) {
+    finisher.queue(waiter->second);
+    waiter = commit_waiters.erase(waiter);
+  }
+}
+
+/**
+ * Submit an encoded transaction buffer to the journal.
+ *
+ * When a writeable journal exists the entry is submitted to it together with
+ * onjournal.  Otherwise a non-null onjournal is registered with the apply
+ * manager as a waiter on op.
+ */
+void JournalingObjectStore::_op_journal_transactions(
+  bufferlist& tbl, uint32_t orig_len, uint64_t op,
+  Context *onjournal, TrackedOpRef osd_op)
+{
+  if (osd_op.get()) {
+    dout(10) << "op_journal_transactions " << op << " reqid_t "
+             << (static_cast<OpRequest *>(osd_op.get()))->get_reqid() << dendl;
+  } else {
+    dout(10) << "op_journal_transactions " << op  << dendl;
+  }
+
+  const bool can_journal = journal && journal->is_writeable();
+  if (can_journal) {
+    journal->submit_entry(op, tbl, orig_len, onjournal, osd_op);
+    return;
+  }
+  if (onjournal) {
+    apply_manager.add_waiter(op, onjournal);
+  }
+}
diff --git a/src/os/filestore/JournalingObjectStore.h b/src/os/filestore/JournalingObjectStore.h
new file mode 100644
index 00000000..a289d0e8
--- /dev/null
+++ b/src/os/filestore/JournalingObjectStore.h
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_JOURNALINGOBJECTSTORE_H
+#define CEPH_JOURNALINGOBJECTSTORE_H
+
+#include "os/ObjectStore.h"
+#include "Journal.h"
+#include "FileJournal.h"
+#include "common/RWLock.h"
+#include "osd/OpRequest.h"
+
+// ObjectStore base class that owns a Journal pointer, a Finisher, and two
+// helpers: SubmitManager (op sequence allocation) and ApplyManager (apply /
+// commit bookkeeping).  Subclasses supply do_transactions().
+class JournalingObjectStore : public ObjectStore {
+protected:
+  Journal *journal;
+  Finisher finisher;
+
+
+  // Hands out op sequence numbers.  op_submit_start()/op_submit_finish()
+  // are defined in JournalingObjectStore.cc.
+  class SubmitManager {
+    CephContext* cct;
+    Mutex lock;
+    uint64_t op_seq;        // last sequence handed out
+    uint64_t op_submitted;  // last sequence passed to op_submit_finish()
+  public:
+    SubmitManager(CephContext* cct) :
+      cct(cct), lock("JOS::SubmitManager::lock", false, true, false),
+      op_seq(0), op_submitted(0)
+    {}
+    uint64_t op_submit_start();
+    void op_submit_finish(uint64_t op);
+    // Sets both counters to seq under 'lock'.
+    void set_op_seq(uint64_t seq) {
+      Mutex::Locker l(lock);
+      op_submitted = op_seq = seq;
+    }
+    // Note: reads op_seq without taking 'lock'.
+    uint64_t get_op_seq() {
+      return op_seq;
+    }
+  } submit_manager;
+
+  // Tracks in-flight applies and the committing/committed sequence numbers.
+  // Holds references to the owning store's journal pointer and finisher.
+  class ApplyManager {
+    CephContext* cct;
+    Journal *&journal;
+    Finisher &finisher;
+
+    // apply_lock is taken around blocked / open_ops / max_applied_seq in
+    // the .cc file and in init_seq() below.
+    Mutex apply_lock;
+    bool blocked;
+    Cond blocked_cond;
+    int open_ops;
+    uint64_t max_applied_seq;
+
+    // com_lock is taken around commit_waiters and the two *_seq members by
+    // the accessors below.
+    Mutex com_lock;
+    map<version_t, vector<Context*> > commit_waiters;
+    uint64_t committing_seq, committed_seq;
+
+  public:
+    ApplyManager(CephContext* cct, Journal *&j, Finisher &f) :
+      cct(cct), journal(j), finisher(f),
+      apply_lock("JOS::ApplyManager::apply_lock", false, true, false),
+      blocked(false),
+      open_ops(0),
+      max_applied_seq(0),
+      com_lock("JOS::ApplyManager::com_lock", false, true, false),
+      committing_seq(0), committed_seq(0) {}
+    // Zeroes the sequence counters; asserts nothing is open or blocked.
+    // Note: takes neither apply_lock nor com_lock.
+    void reset() {
+      ceph_assert(open_ops == 0);
+      ceph_assert(blocked == false);
+      max_applied_seq = 0;
+      committing_seq = 0;
+      committed_seq = 0;
+    }
+    void add_waiter(uint64_t, Context*);
+    uint64_t op_apply_start(uint64_t op);
+    void op_apply_finish(uint64_t op);
+    bool commit_start();
+    void commit_started();
+    void commit_finish();
+    // True while committing_seq differs from committed_seq.
+    bool is_committing() {
+      Mutex::Locker l(com_lock);
+      return committing_seq != committed_seq;
+    }
+    uint64_t get_committed_seq() {
+      Mutex::Locker l(com_lock);
+      return committed_seq;
+    }
+    uint64_t get_committing_seq() {
+      Mutex::Locker l(com_lock);
+      return committing_seq;
+    }
+    // Seeds all three sequence counters with fs_op_seq; each lock is taken
+    // and released separately, never both at once.
+    void init_seq(uint64_t fs_op_seq) {
+      {
+	Mutex::Locker l(com_lock);
+	committed_seq = fs_op_seq;
+	committing_seq = fs_op_seq;
+      }
+      {
+	Mutex::Locker l(apply_lock);
+	max_applied_seq = fs_op_seq;
+      }
+    }
+  } apply_manager;
+
+  // Set to true by journal_replay() while it is replaying entries.
+  bool replaying;
+
+protected:
+  void journal_start();
+  void journal_stop();
+  void journal_write_close();
+  int journal_replay(uint64_t fs_op_seq);
+
+  void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op,
+				Context *onjournal, TrackedOpRef osd_op);
+
+  // Applies a batch of transactions for op_seq; called by journal_replay().
+  virtual int do_transactions(vector<ObjectStore::Transaction>& tls, uint64_t op_seq) = 0;
+
+public:
+  bool is_committing() {
+    return apply_manager.is_committing();
+  }
+  uint64_t get_committed_seq() {
+    return apply_manager.get_committed_seq();
+  }
+
+public:
+  // apply_manager is handed references to 'journal' and 'finisher', both of
+  // which are declared (and therefore initialized) before it.
+  JournalingObjectStore(CephContext* cct, const std::string& path)
+    : ObjectStore(cct, path),
+      journal(NULL),
+      finisher(cct, "JournalObjectStore", "fn_jrn_objstore"),
+      submit_manager(cct),
+      apply_manager(cct, journal, finisher),
+      replaying(false) {}
+
+  ~JournalingObjectStore() override {
+  }
+};
+
+#endif
diff --git a/src/os/filestore/LFNIndex.cc b/src/os/filestore/LFNIndex.cc
new file mode 100644
index 00000000..2451ae8c
--- /dev/null
+++ b/src/os/filestore/LFNIndex.cc
@@ -0,0 +1,1407 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include <errno.h>
+#include <string.h>
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "chain_xattr.h"
+
+#include "LFNIndex.h"
+using ceph::crypto::SHA1;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") "
+
+
+// Out-of-class definitions of LFNIndex's static constants.
+// FILENAME_PREFIX_LEN is what is left of FILENAME_SHORT_LEN after the hash,
+// the cookie and FILENAME_EXTRA are subtracted; it reads
+// FILENAME_COOKIE.size(), so FILENAME_COOKIE must stay defined above it.
+const string LFNIndex::LFN_ATTR = "user.cephos.lfn";
+const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash.";
+const string LFNIndex::SUBDIR_PREFIX = "DIR_";
+const string LFNIndex::FILENAME_COOKIE = "long";
+const int LFNIndex::FILENAME_PREFIX_LEN =  FILENAME_SHORT_LEN - FILENAME_HASH_LEN -
+								FILENAME_COOKIE.size() -
+								FILENAME_EXTRA;
+void LFNIndex::maybe_inject_failure()
+{
+  if (error_injection_enabled) {
+    if (current_failure > last_failure &&
+	(((double)(rand() % 10000))/((double)(10000))
+	 < error_injection_probability)) {
+      last_failure = current_failure;
+      current_failure = 0;
+      throw RetryException();
+    }
+    ++current_failure;
+  }
+}
+
+// Helper to close fd's when we leave scope.  This is useful when used
+// in combination with RetryException, thrown by the above.
+//
+// The guard is non-copyable: a copy would close the same descriptor a
+// second time when it went out of scope.
+struct FDCloser {
+  int fd;
+  explicit FDCloser(int f) : fd(f) {}
+  FDCloser(const FDCloser &) = delete;
+  FDCloser &operator=(const FDCloser &) = delete;
+  ~FDCloser() {
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+  }
+};
+
+
+/* Public methods */
+
+// Returns the length of the current-format object name generated for obj
+// with shard id 0, generation 0 and snap 0 substituted in.
+uint64_t LFNIndex::get_max_escaped_name_len(const hobject_t &obj)
+{
+  ghobject_t probe(obj);
+  probe.hobj.snap = 0;
+  probe.generation = 0;
+  probe.shard_id = shard_id_t(0);
+  const string escaped = lfn_generate_object_name_current(probe);
+  return escaped.size();
+}
+
+// Public entry point; forwards to _init().
+int LFNIndex::init()
+{
+  return _init();
+}
+
+// Records that the object oid was created at 'path': splits the path into
+// directory components and short name, then calls lfn_created() followed by
+// _created().
+// NOTE(review): WRAP_RETRY is a macro defined outside this chunk; the body
+// uses 'r', 'failed' and the 'out' label without declaring them, so the
+// macro presumably supplies them along with the return -- confirm there.
+int LFNIndex::created(const ghobject_t &oid, const char *path)
+{
+  WRAP_RETRY(
+  vector<string> path_comp;
+  string short_name;
+  r = decompose_full_path(path, &path_comp, 0, &short_name);
+  if (r < 0)
+    goto out;
+  r = lfn_created(path_comp, oid, short_name);
+  if (r < 0) {
+    if (failed) {
+      /* This is hacky, but the only way we get ENOENT from lfn_created here is
+       * if we did a failure injection in _created below AND actually started the
+       * split or merge.  In that case, lfn_created already succeeded, and
+       * WRAP_RETRY already cleaned it up and we are actually done.  In a real
+       * failure, the filestore itself would have ended up calling this with
+       * the new path, not the old one, so we'd find it.
+       */
+      r = 0;
+    }
+    goto out;
+  }
+  r = _created(path_comp, oid, short_name);
+  if (r < 0)
+    goto out;
+    );
+}
+
+// Removes oid from the index: resolves its directory and short name with
+// _lookup(), then calls _remove().
+// NOTE(review): 'r' and the 'out' label are not declared here; presumably
+// the WRAP_RETRY macro (defined outside this chunk) provides them -- confirm.
+int LFNIndex::unlink(const ghobject_t &oid)
+{
+  WRAP_RETRY(
+  vector<string> path;
+  string short_name;
+  r = _lookup(oid, &path, &short_name, NULL);
+  if (r < 0) {
+    goto out;
+  }
+  r = _remove(path, oid, short_name);
+  if (r < 0) {
+    goto out;
+  }
+  );
+}
+
+// Resolves oid via _lookup() and stores a Path built from the resulting
+// full path in *out_path.  'hardlink' is passed straight through to
+// _lookup().
+// NOTE(review): 'r' and the 'out' label are not declared here; presumably
+// the WRAP_RETRY macro (defined outside this chunk) provides them -- confirm.
+int LFNIndex::lookup(const ghobject_t &oid,
+		     IndexedPath *out_path,
+		     int *hardlink)
+{
+  WRAP_RETRY(
+  vector<string> path;
+  string short_name;
+  r = _lookup(oid, &path, &short_name, hardlink);
+  if (r < 0)
+    goto out;
+  string full_path = get_full_path(path, short_name);
+  *out_path = std::make_shared<Path>(full_path, this);
+  r = 0;
+  );
+}
+
+// Public entry point; forwards both arguments to _pre_hash_collection().
+int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
+{
+  return _pre_hash_collection(pg_num, expected_num_objs);
+}
+
+
+// Public entry point; forwards all arguments unchanged to
+// _collection_list_partial().
+int LFNIndex::collection_list_partial(const ghobject_t &start,
+				      const ghobject_t &end,
+				      int max_count,
+				      vector<ghobject_t> *ls,
+				      ghobject_t *next)
+{
+  return _collection_list_partial(start, end, max_count, ls, next);
+}
+
+/* Derived class utility methods */
+
+// Opens the subdirectory named by 'path' read-only and fsyncs it.
+// Returns -errno if the open fails and 0 on success; a failed fsync is
+// logged and aborts the process.
+int LFNIndex::fsync_dir(const vector<string> &path)
+{
+  maybe_inject_failure();
+  const string dir_path = get_full_path_subdir(path);
+  const int dirfd = ::open(dir_path.c_str(), O_RDONLY|O_CLOEXEC);
+  if (dirfd < 0) {
+    return -errno;
+  }
+  FDCloser closer(dirfd);  // closes dirfd on every exit, including throws
+  maybe_inject_failure();
+  const int rc = ::fsync(dirfd);
+  maybe_inject_failure();
+  if (rc < 0) {
+    derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  return 0;
+}
+
+// Hard-links the file 'from_short_name' in directory 'from' to the path
+// lfn_get_name() yields for oid in directory 'to'.
+// Returns 0 on success, the lfn_get_name() error, or -errno from link().
+int LFNIndex::link_object(const vector<string> &from,
+			  const vector<string> &to,
+			  const ghobject_t &oid,
+			  const string &from_short_name)
+{
+  const string src = get_full_path(from, from_short_name);
+  string dst;
+  maybe_inject_failure();
+  int r = lfn_get_name(to, oid, 0, &dst, 0);
+  if (r < 0) {
+    return r;
+  }
+  maybe_inject_failure();
+  r = ::link(src.c_str(), dst.c_str());
+  maybe_inject_failure();
+  return (r < 0) ? -errno : 0;
+}
+
+/**
+ * Unlink every file named in to_remove from 'dir', keeping hashed-name
+ * chains free of gaps.
+ *
+ * Non-hashed names are simply unlinked.  For a hashed name the chain of
+ * short names lfn_get_short_name(obj, 0), (obj, 1), ... is walked until an
+ * index is found that is in neither map; indices found in *remaining are
+ * kept in 'chain', indices found in to_remove become 'holes'.  Holes are
+ * then filled in ascending order by renaming the highest-indexed surviving
+ * chain entries into them; a hole that lies above the current candidate is
+ * just unlinked.  *remaining is updated to reflect each rename.
+ *
+ * @param dir        directory components containing the files
+ * @param to_remove  short name -> object, files to delete
+ * @param remaining  short name -> object, files that stay (updated in place)
+ * @return 0 on success, -errno from the first failing unlink()/rename()
+ */
+int LFNIndex::remove_objects(const vector<string> &dir,
+			     const map<string, ghobject_t> &to_remove,
+			     map<string, ghobject_t> *remaining)
+{
+  // index-0 short names of chains that have already been compacted
+  set<string> clean_chains;
+  for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin();
+       to_clean != to_remove.end();
+       ++to_clean) {
+    if (!lfn_is_hashed_filename(to_clean->first)) {
+      maybe_inject_failure();
+      int r = ::unlink(get_full_path(dir, to_clean->first).c_str());
+      maybe_inject_failure();
+      if (r < 0)
+	return -errno;
+      continue;
+    }
+    if (clean_chains.count(lfn_get_short_name(to_clean->second, 0)))
+      continue;
+    set<int> holes;
+    map<int, pair<string, ghobject_t> > chain;
+    // classify each chain index as surviving (chain) or removed (holes)
+    for (int i = 0; ; ++i) {
+      string short_name = lfn_get_short_name(to_clean->second, i);
+      if (remaining->count(short_name)) {
+	chain[i] = *(remaining->find(short_name));
+      } else if (to_remove.count(short_name)) {
+	holes.insert(i);
+      } else {
+	break;
+      }
+    }
+
+    // candidate walks surviving entries from the highest index downwards
+    map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin();
+    for (set<int>::iterator i = holes.begin();
+	 i != holes.end();
+	 ++i) {
+      if (candidate == chain.rend() || *i > candidate->first) {
+	// nothing left above this hole to move down: just delete the file
+	string remove_path_name =
+	  get_full_path(dir, lfn_get_short_name(to_clean->second, *i));
+	maybe_inject_failure();
+	int r = ::unlink(remove_path_name.c_str());
+	maybe_inject_failure();
+	if (r < 0)
+	  return -errno;
+	continue;
+      }
+      // move the candidate into the hole's slot
+      string from = get_full_path(dir, candidate->second.first);
+      string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i));
+      maybe_inject_failure();
+      int r = ::rename(from.c_str(), to.c_str());
+      maybe_inject_failure();
+      if (r < 0)
+	return -errno;
+      remaining->erase(candidate->second.first);
+      remaining->insert(pair<string, ghobject_t>(
+			  lfn_get_short_name(candidate->second.second, *i),
+					     candidate->second.second));
+      ++candidate;
+    }
+    if (!holes.empty())
+      clean_chains.insert(lfn_get_short_name(to_clean->second, 0));
+  }
+  return 0;
+}
+
+/**
+ * Move every object listed in directory 'from' into directory 'to'.
+ *
+ * First pass links each file into 'to' (an existing link is tolerated) and
+ * calls lfn_created() for it; 'to' is then fsynced.  Second pass unlinks
+ * the originals, after which 'from' is fsynced.
+ *
+ * @return 0 on success, otherwise the first negative error encountered
+ */
+int LFNIndex::move_objects(const vector<string> &from,
+			   const vector<string> &to)
+{
+  map<string, ghobject_t> to_move;
+  int r = list_objects(from, 0, NULL, &to_move);
+  if (r < 0) {
+    return r;
+  }
+
+  for (const auto &entry : to_move) {
+    const string &short_name = entry.first;
+    const ghobject_t &obj = entry.second;
+
+    const string src = get_full_path(from, short_name);
+    string dst, dst_name;
+    r = lfn_get_name(to, obj, &dst_name, &dst, 0);
+    if (r < 0) {
+      return r;
+    }
+    maybe_inject_failure();
+    r = ::link(src.c_str(), dst.c_str());
+    if (r < 0 && errno != EEXIST) {
+      return -errno;
+    }
+    maybe_inject_failure();
+    r = lfn_created(to, obj, dst_name);
+    maybe_inject_failure();
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  r = fsync_dir(to);
+  if (r < 0) {
+    return r;
+  }
+
+  for (const auto &entry : to_move) {
+    maybe_inject_failure();
+    r = ::unlink(get_full_path(from, entry.first).c_str());
+    maybe_inject_failure();
+    if (r < 0) {
+      return -errno;
+    }
+  }
+  return fsync_dir(from);
+}
+
+int LFNIndex::remove_object(const vector<string> &from,
+			    const ghobject_t &oid)
+{
+  string short_name;
+  int r, exist;
+  maybe_inject_failure();
+  r = get_mangled_name(from, oid, &short_name, &exist);
+  maybe_inject_failure();
+  if (r < 0)
+    return r;
+  if (exist == 0)
+    return -ENOENT;
+  return lfn_unlink(from, oid, short_name);
+}
+
+// Thin wrapper over lfn_get_name() that asks for the short (mangled) name
+// and the hard-link count but not the full path.
+int LFNIndex::get_mangled_name(const vector<string> &from,
+			       const ghobject_t &oid,
+			       string *mangled_name, int *hardlink)
+{
+  return lfn_get_name(from, oid, mangled_name, 0, hardlink);
+}
+
+// Renames subdirectory path/dir from index 'from' to the same relative
+// location in index 'dest'.  Returns 0 or -errno from rename().
+int LFNIndex::move_subdir(
+  LFNIndex &from,
+  LFNIndex &dest,
+  const vector<string> &path,
+  string dir
+  )
+{
+  vector<string> sub_path(path);
+  sub_path.push_back(dir);
+  const string src(from.get_full_path_subdir(sub_path));
+  const string dst(dest.get_full_path_subdir(sub_path));
+  if (::rename(src.c_str(), dst.c_str()) < 0) {
+    return -errno;
+  }
+  return 0;
+}
+
+/**
+ * Move a single object from index 'from' to index 'dest', keeping the same
+ * relative directory 'path'.
+ *
+ * The file is hard-linked into 'dest' unless lfn_get_name() reports that it
+ * already exists there, lfn_created() is called, 'dest' is fsynced, and only
+ * then is the original removed and 'from' fsynced.
+ *
+ * @param obj  (short name in 'from', object id)
+ * @return 0 on success or a negative errno value.  A failing link() is
+ *         reported as -errno; previously the raw -1 from link() was
+ *         returned, which callers would read as -EPERM.
+ */
+int LFNIndex::move_object(
+  LFNIndex &from,
+  LFNIndex &dest,
+  const vector<string> &path,
+  const pair<string, ghobject_t> &obj
+  )
+{
+  string from_path(from.get_full_path(path, obj.first));
+  string to_path;
+  string to_name;
+  int exists;
+  int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists);
+  if (r < 0)
+    return r;
+  if (!exists) {
+    r = ::link(from_path.c_str(), to_path.c_str());
+    if (r < 0)
+      return -errno;
+  }
+  r = dest.lfn_created(path, obj.second, to_name);
+  if (r < 0)
+    return r;
+  r = dest.fsync_dir(path);
+  if (r < 0)
+    return r;
+  r = from.remove_object(path, obj.second);
+  if (r < 0)
+    return r;
+  return from.fsync_dir(path);
+}
+
+
+// Reads the "user.ceph._" xattr of dir/file, decodes it as an object_info_t
+// and stores ghobject_t(oi.soid) in *o.  Returns 0 on success or the
+// negative value returned by chain_getxattr_buf().
+static int get_hobject_from_oinfo(const char *dir, const char *file,
+				  ghobject_t *o)
+{
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s/%s", dir, file);
+  // Hack, user.ceph._ is the attribute used to store the object info
+  bufferptr attr;
+  const int len = chain_getxattr_buf(path, "user.ceph._", &attr);
+  if (len < 0) {
+    return len;
+  }
+  bufferlist encoded;
+  if (len > 0) {
+    encoded.push_back(attr);
+  }
+  object_info_t oi(encoded);
+  *o = ghobject_t(oi.soid);
+  return 0;
+}
+
+
+/**
+ * List the object files in a directory.
+ *
+ * @param to_list   directory components to list
+ * @param max_objs  stop after this many objects; <= 0 means no limit
+ * @param handle    optional in/out directory position: a non-zero value is
+ *                  passed to seekdir() before reading, and the position is
+ *                  written back with telldir() if any entry was read
+ * @param out       short name -> object, filled with the entries found
+ * @return 0 on success, -errno if the directory cannot be opened, or a
+ *         negative error from lfn_translate() (-EINVAL entries are skipped)
+ *
+ * The max_objs limit is tested before readdir() is called.  Testing it
+ * afterwards consumed one extra entry that was then dropped, so the position
+ * reported through *handle pointed past it and a caller resuming from the
+ * handle never saw that entry.
+ */
+int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
+			   long *handle, map<string, ghobject_t> *out)
+{
+  string to_list_path = get_full_path_subdir(to_list);
+  DIR *dir = ::opendir(to_list_path.c_str());
+  if (!dir) {
+    return -errno;
+  }
+
+  if (handle && *handle) {
+    seekdir(dir, *handle);
+  }
+
+  struct dirent *de = nullptr;
+  int r = 0;
+  int listed = 0;
+  bool end = true;  // stays true only if readdir() returned no entry at all
+  while (!(max_objs > 0 && listed >= max_objs) && (de = ::readdir(dir))) {
+    end = false;
+    if (de->d_name[0] == '.')
+      continue;
+    string short_name(de->d_name);
+    ghobject_t obj;
+    if (lfn_is_object(short_name)) {
+      r = lfn_translate(to_list, short_name, &obj);
+      if (r == -EINVAL) {
+	continue;
+      } else if (r < 0) {
+	goto cleanup;
+      } else {
+	string long_name = lfn_generate_object_name(obj);
+	if (!lfn_must_hash(long_name)) {
+	  ceph_assert(long_name == short_name);
+	}
+	// For HASH_INDEX_TAG indexes the object id is re-read from the
+	// object-info xattr; a failure here is ignored and obj keeps the
+	// value produced by lfn_translate().
+	if (index_version == HASH_INDEX_TAG)
+	  get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
+
+	out->insert(pair<string, ghobject_t>(short_name, obj));
+	++listed;
+      }
+    }
+  }
+
+  if (handle && !end) {
+    *handle = telldir(dir);
+  }
+
+  r = 0;
+ cleanup:
+  ::closedir(dir);
+  return r;
+}
+
+// Appends to *out the demangled name of every entry in directory 'to_list'
+// that lfn_is_subdir() recognises.  Returns 0, or -errno if the directory
+// cannot be opened.
+int LFNIndex::list_subdirs(const vector<string> &to_list,
+			   vector<string> *out)
+{
+  const string dir_path = get_full_path_subdir(to_list);
+  DIR *dir = ::opendir(dir_path.c_str());
+  if (dir == nullptr) {
+    return -errno;
+  }
+
+  for (struct dirent *de = ::readdir(dir); de != nullptr; de = ::readdir(dir)) {
+    string entry_name(de->d_name);
+    string demangled;
+    if (lfn_is_subdir(entry_name, &demangled)) {
+      out->push_back(demangled);
+    }
+  }
+
+  ::closedir(dir);
+  return 0;
+}
+
+// Creates the subdirectory 'to_create' with mode 0777.
+// Returns 0 or -errno from mkdir().
+int LFNIndex::create_path(const vector<string> &to_create)
+{
+  maybe_inject_failure();
+  const int rc = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777);
+  maybe_inject_failure();
+  return (rc < 0) ? -errno : 0;
+}
+
+// Removes the subdirectory 'to_remove' with rmdir().
+// Returns 0 or -errno.
+int LFNIndex::remove_path(const vector<string> &to_remove)
+{
+  maybe_inject_failure();
+  const int rc = ::rmdir(get_full_path_subdir(to_remove).c_str());
+  maybe_inject_failure();
+  return (rc < 0) ? -errno : 0;
+}
+
+// Sets *exists to 1 if stat() succeeds on the subdirectory 'to_check' and to
+// 0 if it fails with ENOENT, returning 0 in both cases.  Any other stat()
+// failure is returned as -errno and *exists is left untouched.
+int LFNIndex::path_exists(const vector<string> &to_check, int *exists)
+{
+  const string full_path = get_full_path_subdir(to_check);
+  struct stat st;
+  if (::stat(full_path.c_str(), &st) == 0) {
+    *exists = 1;
+    return 0;
+  }
+
+  const int err = -errno;
+  if (err != -ENOENT) {
+    return err;
+  }
+  *exists = 0;
+  return 0;
+}
+
+// Stores attr_value under the mangled form of attr_name as an xattr on the
+// subdirectory 'path'.  Returns whatever chain_setxattr() returns.
+int LFNIndex::add_attr_path(const vector<string> &path,
+			    const string &attr_name,
+			    bufferlist &attr_value)
+{
+  const string dir_path = get_full_path_subdir(path);
+  maybe_inject_failure();
+  const string xattr_name = mangle_attr_name(attr_name);
+  void *payload = reinterpret_cast<void *>(attr_value.c_str());
+  return chain_setxattr<false, true>(
+    dir_path.c_str(), xattr_name.c_str(), payload, attr_value.length());
+}
+
+// Reads the xattr stored under the mangled form of attr_name on the
+// subdirectory 'path'.  A positive result is appended to attr_value.
+// Returns whatever chain_getxattr_buf() returns.
+int LFNIndex::get_attr_path(const vector<string> &path,
+			    const string &attr_name,
+			    bufferlist &attr_value)
+{
+  const string dir_path = get_full_path_subdir(path);
+  bufferptr value;
+  const int len = chain_getxattr_buf(
+    dir_path.c_str(), mangle_attr_name(attr_name).c_str(), &value);
+  if (len > 0) {
+    attr_value.push_back(value);
+  }
+  return len;
+}
+
+// Removes the xattr stored under the mangled form of attr_name from the
+// subdirectory 'path'.  Returns whatever chain_removexattr() returns.
+int LFNIndex::remove_attr_path(const vector<string> &path,
+			       const string &attr_name)
+{
+  string full_path = get_full_path_subdir(path);
+  string mangled_attr_name = mangle_attr_name(attr_name);
+  maybe_inject_failure();
+  return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str());
+}
+
+/**
+ * Build the "keyless" file name for oid:
+ *   <escaped object name>_<head|snapdir|hex snap>_<8-digit upper-case hex hash>
+ *
+ * Escaping: a leading "DIR_" becomes "\d", a leading '.' becomes "\.",
+ * '\' becomes "\\" and '/' becomes "\s".  The object name is read up to its
+ * first NUL byte.  Asserts that oid has no generation.
+ *
+ * The name is assembled in a std::string.  The earlier fixed
+ * char[FILENAME_MAX_LEN] buffer checked only 't < end' before writing
+ * two-byte escapes and then passed 'end - t' to snprintf(), so a long enough
+ * object name could write past the buffer.
+ */
+string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid)
+{
+  ceph_assert(oid.generation == ghobject_t::NO_GEN);
+
+  const char *const name = oid.hobj.oid.name.c_str();
+  const char *i = name;
+  string out;
+
+  // Escape subdir prefix
+  if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+    out.append("\\d");
+    i += 4;
+  }
+  for (; *i; ++i) {
+    if (*i == '\\') {
+      out.append("\\\\");
+    } else if (*i == '.' && i == name) {  // only escape leading .
+      out.append("\\.");
+    } else if (*i == '/') {
+      out.append("\\s");
+    } else {
+      out.push_back(*i);
+    }
+  }
+
+  // Each suffix is at most "_" plus 16 hex digits, so 64 bytes is ample.
+  char suffix[64];
+  if (oid.hobj.snap == CEPH_NOSNAP)
+    snprintf(suffix, sizeof(suffix), "_head");
+  else if (oid.hobj.snap == CEPH_SNAPDIR)
+    snprintf(suffix, sizeof(suffix), "_snapdir");
+  else
+    snprintf(suffix, sizeof(suffix), "_%llx", (long long unsigned)oid.hobj.snap);
+  out.append(suffix);
+
+  snprintf(suffix, sizeof(suffix), "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+  out.append(suffix);
+
+  return out;
+}
+
+// Appends [begin, end) to *out, replacing '\' with "\\", '/' with "\s",
+// '_' with "\u" and a NUL byte with "\n"; every other character is copied
+// unchanged.
+static void append_escaped(string::const_iterator begin,
+			   string::const_iterator end,
+			   string *out)
+{
+  for (string::const_iterator cur = begin; cur != end; ++cur) {
+    switch (*cur) {
+    case '\\':
+      out->append("\\\\");
+      break;
+    case '/':
+      out->append("\\s");
+      break;
+    case '_':
+      out->append("\\u");
+      break;
+    case '\0':
+      out->append("\\n");
+      break;
+    default:
+      out->push_back(*cur);
+      break;
+    }
+  }
+}
+
+/**
+ * Build the current-format file name for oid.  Fields are joined by '_':
+ *
+ *   <name>_<key>_<head|snapdir|hex snap>_<8-digit hex hash>_<nspace>_<pool>
+ *
+ * followed by _<hex generation>_<hex shard> when oid carries a generation or
+ * a shard id.  name, key and nspace go through append_escaped(); a leading
+ * "DIR_" in the name becomes "\d" and a leading '.' becomes "\.".  The pool
+ * is written as "none" when it is -1, otherwise in hex.
+ */
+string LFNIndex::lfn_generate_object_name_current(const ghobject_t &oid)
+{
+  const string &name = oid.hobj.oid.name;
+  string full_name;
+
+  string::const_iterator rest = name.begin();
+  if (name.compare(0, 4, "DIR_") == 0) {
+    full_name += "\\d";
+    rest += 4;
+  } else if (name[0] == '.') {
+    full_name += "\\.";
+    ++rest;
+  }
+  append_escaped(rest, name.end(), &full_name);
+  full_name += "_";
+  append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
+  full_name += "_";
+
+  char scratch[PATH_MAX];
+  int len;
+
+  // snap and hash share one scratch fill
+  if (oid.hobj.snap == CEPH_NOSNAP) {
+    len = snprintf(scratch, sizeof(scratch), "head");
+  } else if (oid.hobj.snap == CEPH_SNAPDIR) {
+    len = snprintf(scratch, sizeof(scratch), "snapdir");
+  } else {
+    len = snprintf(scratch, sizeof(scratch), "%llx", (long long unsigned)oid.hobj.snap);
+  }
+  len += snprintf(scratch + len, sizeof(scratch) - len, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+  full_name.append(scratch, len);
+  full_name += "_";
+
+  append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name);
+  full_name += "_";
+
+  if (oid.hobj.pool == -1) {
+    len = snprintf(scratch, sizeof(scratch), "none");
+  } else {
+    len = snprintf(scratch, sizeof(scratch), "%llx", (long long unsigned)oid.hobj.pool);
+  }
+  full_name.append(scratch, len);
+
+  const bool has_gen_or_shard =
+    oid.generation != ghobject_t::NO_GEN ||
+    oid.shard_id != shard_id_t::NO_SHARD;
+  if (has_gen_or_shard) {
+    full_name += "_";
+    len = snprintf(scratch, sizeof(scratch), "%llx", (long long unsigned)oid.generation);
+    full_name.append(scratch, len);
+
+    full_name += "_";
+    len = snprintf(scratch, sizeof(scratch), "%x", (int)oid.shard_id);
+    full_name.append(scratch, len);
+  }
+
+  return full_name;
+}
+
+/**
+ * Build the pool-less file name for oid:
+ *
+ *   <name>_<key>_<head|snapdir|hex snap>_<8-digit hex hash>
+ *
+ * Indexes at HASH_INDEX_TAG use lfn_generate_object_name_keyless() instead.
+ * name and key go through append_escaped(); a leading "DIR_" in the name
+ * becomes "\d" and a leading '.' becomes "\.".  Asserts that oid has no
+ * generation.
+ */
+string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid)
+{
+  if (index_version == HASH_INDEX_TAG) {
+    return lfn_generate_object_name_keyless(oid);
+  }
+
+  ceph_assert(oid.generation == ghobject_t::NO_GEN);
+
+  const string &name = oid.hobj.oid.name;
+  string full_name;
+  string::const_iterator rest = name.begin();
+  if (name.compare(0, 4, "DIR_") == 0) {
+    full_name += "\\d";
+    rest += 4;
+  } else if (name[0] == '.') {
+    full_name += "\\.";
+    ++rest;
+  }
+  append_escaped(rest, name.end(), &full_name);
+  full_name += "_";
+  append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
+  full_name += "_";
+
+  char scratch[PATH_MAX];
+  int len;
+  if (oid.hobj.snap == CEPH_NOSNAP) {
+    len = snprintf(scratch, sizeof(scratch), "head");
+  } else if (oid.hobj.snap == CEPH_SNAPDIR) {
+    len = snprintf(scratch, sizeof(scratch), "snapdir");
+  } else {
+    len = snprintf(scratch, sizeof(scratch), "%llx", (long long unsigned)oid.hobj.snap);
+  }
+  snprintf(scratch + len, sizeof(scratch) - len, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+  full_name.append(scratch);
+  return full_name;
+}
+
+/**
+ * Resolve the on-disk file name for oid inside directory 'path'.
+ *
+ * If the generated long name does not need hashing it is used directly.
+ * Otherwise candidate short names lfn_get_short_name(oid, 0), (oid, 1), ...
+ * are probed in order: a candidate whose LFN xattr (or, for files with more
+ * than one link, alt LFN xattr) equals the long name is the match; the first
+ * candidate with no LFN xattr or no file at all is the free slot to use.
+ *
+ * @param path          directory components to look in
+ * @param oid           object to resolve
+ * @param mangled_name  [out, optional] file name within the directory
+ * @param out_path      [out, optional] full path of that file
+ * @param hardlink      [out, optional] st_nlink of the file, 0 if absent
+ * @return 0 on success, negative errno on failure
+ *
+ * NOTE(review): after chain_getxattr_buf() returns r < 0 the code classifies
+ * the failure by reading errno rather than r; this assumes errno still holds
+ * the failing code at that point -- confirm against chain_getxattr_buf().
+ */
+int LFNIndex::lfn_get_name(const vector<string> &path,
+			   const ghobject_t &oid,
+			   string *mangled_name, string *out_path,
+			   int *hardlink)
+{
+  string full_name = lfn_generate_object_name(oid);
+  int r;
+
+  // Fast path: the long name is usable as the file name itself.
+  if (!lfn_must_hash(full_name)) {
+    if (mangled_name)
+      *mangled_name = full_name;
+    if (out_path)
+      *out_path = get_full_path(path, full_name);
+    if (hardlink) {
+      struct stat buf;
+      string full_path = get_full_path(path, full_name);
+      maybe_inject_failure();
+      r = ::stat(full_path.c_str(), &buf);
+      if (r < 0) {
+	if (errno == ENOENT)
+	  *hardlink = 0;
+	else
+	  return -errno;
+      } else {
+	*hardlink = buf.st_nlink;
+      }
+    }
+    return 0;
+  }
+
+  // Hashed names can collide, so walk the candidates until we find either
+  // our own file or the first unused slot.
+  int i = 0;
+  string candidate;
+  string candidate_path;
+  for ( ; ; ++i) {
+    candidate = lfn_get_short_name(oid, i);
+    candidate_path = get_full_path(path, candidate);
+    bufferptr bp;
+    r = chain_getxattr_buf(
+      candidate_path.c_str(),
+      get_lfn_attr().c_str(),
+      &bp);
+    if (r < 0) {
+      if (errno != ENODATA && errno != ENOENT)
+	return -errno;
+      if (errno == ENODATA) {
+	// Left over from incomplete transaction, it'll be replayed
+	maybe_inject_failure();
+	r = ::unlink(candidate_path.c_str());
+	maybe_inject_failure();
+	if (r < 0)
+	  return -errno;
+      }
+      // Slot is free: report it with a link count of 0.
+      if (mangled_name)
+	*mangled_name = candidate;
+      if (out_path)
+	*out_path = candidate_path;
+      if (hardlink)
+	*hardlink = 0;
+      return 0;
+    }
+    ceph_assert(r > 0);
+    string lfn(bp.c_str(), bp.length());
+    if (lfn == full_name) {
+      // Primary LFN attr matches: this candidate is our file.
+      if (mangled_name)
+	*mangled_name = candidate;
+      if (out_path)
+	*out_path = candidate_path;
+      if (hardlink) {
+	struct stat st;
+	r = ::stat(candidate_path.c_str(), &st);
+        if (r < 0) {
+          if (errno == ENOENT)
+            *hardlink = 0;
+          else
+            return -errno;
+        } else {
+	  *hardlink = st.st_nlink;
+	}
+      }
+      return 0;
+    }
+    // Primary attr names a different object; check the alternate attr.
+    bp = bufferptr();
+    r = chain_getxattr_buf(
+      candidate_path.c_str(),
+      get_alt_lfn_attr().c_str(),
+      &bp);
+    if (r > 0) {
+      // only consider alt name if nlink > 1
+      struct stat st;
+      int rc = ::stat(candidate_path.c_str(), &st);
+      if (rc < 0)
+	return -errno;
+      if (st.st_nlink <= 1) {
+	// left over from incomplete unlink, remove
+	maybe_inject_failure();
+	dout(20) << __func__ << " found extra alt attr for " << candidate_path
+		 << ", long name " << string(bp.c_str(), bp.length()) << dendl;
+	rc = chain_removexattr(candidate_path.c_str(),
+			       get_alt_lfn_attr().c_str());
+	maybe_inject_failure();
+	if (rc < 0)
+	  return rc;
+	continue;
+      }
+      string lfn(bp.c_str(), bp.length());
+      if (lfn == full_name) {
+	dout(20) << __func__ << " used alt attr for " << full_name << dendl;
+	if (mangled_name)
+	  *mangled_name = candidate;
+	if (out_path)
+	  *out_path = candidate_path;
+	if (hardlink)
+	  *hardlink = st.st_nlink;
+	return 0;
+      }
+    }
+  }
+  ceph_abort(); // Unreachable
+  return 0;
+}
+
+/// Record oid's long name in the lfn xattr of a newly created hashed file.
+/// Files whose name is not hashed need no bookkeeping and yield 0.
+/// @return 0 on success, otherwise the negative result of chain_setxattr
+int LFNIndex::lfn_created(const vector<string> &path, const ghobject_t &oid,
+			  const string &mangled_name)
+{
+  if (!lfn_is_hashed_filename(mangled_name))
+    return 0;
+  const string target = get_full_path(path, mangled_name);
+  const string long_name = lfn_generate_object_name(oid);
+  maybe_inject_failure();
+
+  // A different long name already stored in the main attr is preserved by
+  // copying it to the alt attr before the main attr is overwritten below.
+  bufferptr old_attr;
+  const int got = chain_getxattr_buf(target.c_str(), get_lfn_attr().c_str(),
+				     &old_attr);
+  if (got > 0) {
+    const string old_name(old_attr.c_str(), old_attr.length());
+    if (old_name != long_name) {
+      dout(20) << __func__ << " " << mangled_name
+	       << " moving old name to alt attr "
+	       << old_name
+	       << ", new name is " << long_name << dendl;
+      const int moved = chain_setxattr<false, true>(
+	target.c_str(), get_alt_lfn_attr().c_str(),
+	old_attr.c_str(), old_attr.length());
+      if (moved < 0)
+	return moved;
+    }
+  }
+
+  return chain_setxattr<false, true>(
+    target.c_str(), get_lfn_attr().c_str(), long_name.c_str(), long_name.size());
+}
+
+/// Remove oid's file mangled_name from path, keeping oid's hashed-name chain dense.
+/// @return 0 on success, negative errno on failure
+int LFNIndex::lfn_unlink(const vector<string> &path, const ghobject_t &oid, const string &mangled_name)
+{
+  if (!lfn_is_hashed_filename(mangled_name)) {
+    string full_path = get_full_path(path, mangled_name);
+    maybe_inject_failure();
+    int r = ::unlink(full_path.c_str());
+    maybe_inject_failure();
+    if (r < 0)
+      return -errno;
+    return 0;
+  }
+  // Locate the collision index whose short name is mangled_name.
+  int i = 0;
+  for ( ; ; ++i) {
+    string candidate = lfn_get_short_name(oid, i);
+    if (candidate == mangled_name)
+      break;
+  }
+  int removed_index = i;
+  ++i;
+  // Advance i to the first later index that has no file on disk.
+  for ( ; ; ++i) {
+    struct stat buf;
+    string to_check = lfn_get_short_name(oid, i);
+    string to_check_path = get_full_path(path, to_check);
+    int r = ::stat(to_check_path.c_str(), &buf);
+    if (r < 0) {
+      if (errno == ENOENT)
+	break;
+      return -errno;
+    }
+  }
+  // Keep the file open so its link count can be checked after the name is gone.
+  string full_path = get_full_path(path, mangled_name);
+  int fd = ::open(full_path.c_str(), O_RDONLY|O_CLOEXEC);
+  if (fd < 0)
+    return -errno;
+  FDCloser f(fd);
+  if (i == removed_index + 1) {  // removed entry is the last of the chain: drop the name
+    maybe_inject_failure();
+    int r = ::unlink(full_path.c_str());
+    maybe_inject_failure();
+    if (r < 0)
+      return -errno;
+  } else {  // otherwise move the last entry of the chain into the vacated name
+    string& rename_to = full_path;
+    string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
+    maybe_inject_failure();
+    int r = ::rename(rename_from.c_str(), rename_to.c_str());
+    maybe_inject_failure();
+    if (r < 0)
+      return -errno;
+  }
+  struct stat st;
+  int r = ::fstat(fd, &st);
+  if (r == 0 && st.st_nlink > 0) {
+    // the file still has other links: remove alt attr
+    dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
+    fsync_dir(path);
+    chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
+  }
+  return r < 0 ? -errno : 0;  // a failed fstat yields its errno, not a raw -1
+}
+
+/// Map the directory entry short_name found in path back to its ghobject_t.
+/// Unhashed names are parsed directly; hashed names are resolved via xattrs.
+///
+/// @param path directory containing the entry
+/// @param short_name entry name as found on disk
+/// @param out [out] receives the parsed object
+/// @return the result of lfn_parse_object_name, -EINVAL if the lfn attr is
+///         empty, or the negative error from reading the lfn attr
+int LFNIndex::lfn_translate(const vector<string> &path, const string &short_name,
+			    ghobject_t *out)
+{
+  if (!lfn_is_hashed_filename(short_name))
+    return lfn_parse_object_name(short_name, out);
+
+  const string file = get_full_path(path, short_name);
+
+  // The alt attr wins when it holds a long name that hashes to short_name.
+  bufferptr attr;
+  int got = chain_getxattr_buf(file.c_str(), get_alt_lfn_attr().c_str(), &attr);
+  if (got > 0) {
+    const string alt_name(attr.c_str(), attr.length());
+    if (short_name_matches(short_name.c_str(), alt_name.c_str()))
+      return lfn_parse_object_name(alt_name, out);
+  }
+
+  // Otherwise fall back to the main lfn attr, which must be readable and non-empty.
+  attr = bufferptr();
+  got = chain_getxattr_buf(file.c_str(), get_lfn_attr().c_str(), &attr);
+  if (got == 0)
+    return -EINVAL;
+  if (got < 0)
+    return got;
+
+  const string long_name(attr.c_str(), attr.length());
+  return lfn_parse_object_name(long_name, out);
+}
+
+bool LFNIndex::lfn_is_object(const string &short_name)
+{ // everything except an unhashed name carrying the subdir prefix counts as an object
+  return !(!lfn_is_hashed_filename(short_name) && lfn_is_subdir(short_name, NULL));
+}
+
+/// Tell whether name carries SUBDIR_PREFIX; optionally hand back the bare name.
+bool LFNIndex::lfn_is_subdir(const string &name, string *demangled)
+{
+  const bool has_prefix = name.compare(0, SUBDIR_PREFIX.size(), SUBDIR_PREFIX) == 0;
+  if (!has_prefix)
+    return false;
+  if (demangled) *demangled = demangle_path_component(name);
+  return true;
+}
+
+/// Parse a keyless long name of the form "<escaped oid>_<snap>_<hash>".
+/// The oid part may use the escapes \\, \., \s ('/') and \d ("DIR_").
+/// @param s NUL-terminated long name
+/// @param o [out] gets oid name, snap and hash; untouched when 0 is returned
+/// @return 1 if the name was parsed, 0 if it is malformed
+static int parse_object(const char *s, ghobject_t& o)
+{
+  const size_t len = strlen(s);
+  if (len == 0)
+    return 0;  // nothing to scan; also avoids forming s - 1
+  const char *hash = s + len - 1;  // walk back to the '_' before the hash
+  while (*hash != '_' && hash > s)
+    hash--;
+  if (hash == s)
+    return 0;  // no snap field can precede the hash; avoids reading s[-1]
+  const char *bar = hash - 1;  // walk back to the '_' before the snap
+  while (*bar != '_' && bar > s)
+    bar--;
+  if (*bar != '_')
+    return 0;
+
+  // Validate the hash before touching o so that failures leave it unchanged.
+  unsigned int hobject_hash_input = 0;
+  if (sscanf(hash, "_%X", &hobject_hash_input) != 1)
+    return 0;
+
+  // Unescape into a growable string: "\d" expands to four characters, so a
+  // buffer sized from the escaped length could be overrun.
+  string name;
+  for (const char *i = s; i < bar; ++i) {
+    if (*i != '\\') {
+      name += *i;
+      continue;
+    }
+    switch (*++i) {  // i <= bar, so this reads at most the snap's '_'
+    case '\\': name += '\\'; break;
+    case '.': name += '.'; break;
+    case 's': name += '/'; break;
+    case 'd': name += "DIR_"; break;
+    default: ceph_abort();
+    }
+  }
+  o.hobj.oid.name = name;
+  if (strncmp(bar+1, "head", 4) == 0)
+    o.hobj.snap = CEPH_NOSNAP;
+  else if (strncmp(bar+1, "snapdir", 7) == 0)
+    o.hobj.snap = CEPH_SNAPDIR;
+  else
+    o.hobj.snap = strtoull(bar+1, NULL, 16);
+  o.hobj.set_hash(hobject_hash_input);
+  return 1;
+}
+
+/// Parse a HASH_INDEX_TAG long name; the pool comes from the collection, -1 if none.
+int LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
+{
+  const bool parsed = parse_object(long_name.c_str(), *out) != 0;
+  spg_t pg;
+  // the pool is assigned even when parsing failed
+  out->hobj.pool = coll().is_pg_prefix(&pg) ? (int64_t)pg.pgid.pool() : int64_t(-1);
+  if (!parsed)
+    return -EINVAL;
+  lfn_generate_object_name(*out);  // NOTE(review): result is discarded — looks like a leftover, confirm
+  return 0;
+}
+
+/// Append the unescaped form of [begin, end) to *out.
+/// Escapes: "\\" -> backslash, "\s" -> '/', "\n" -> NUL byte, "\u" -> '_'.
+/// @return false on an unknown escape or a backslash that ends the range
+static bool append_unescaped(string::const_iterator begin,
+			     string::const_iterator end, string *out)
+{
+  for (string::const_iterator i = begin; i != end; ++i) {
+    if (*i != '\\') {
+      out->push_back(*i);
+      continue;
+    }
+    if (++i == end)
+      return false;  // dangling backslash: do not read or step past end
+    switch (*i) {
+    case '\\': out->push_back('\\'); break;
+    case 's': out->push_back('/'); break;
+    case 'n': out->push_back('\0'); break;
+    case 'u': out->push_back('_'); break;
+    default: return false;
+    }
+  }
+  return true;
+}
+
+/// Parse a HASH_INDEX_TAG_2 long name of the form <name>_<key>_<snap>_<hash>.
+/// @return 0 on success (pool comes from coll(), -1 if it is not a PG), -EINVAL if malformed
+int LFNIndex::lfn_parse_object_name_poolless(const string &long_name, ghobject_t *out)
+{
+  string name;
+  string key;
+  uint32_t hash;
+  snapid_t snap;
+  if (long_name.empty())
+    return -EINVAL;  // never dereference begin() of an empty string
+  string::const_iterator current = long_name.begin();
+  if (*current == '\\') {  // leading "\d" / "\." encode a "DIR_" / "." prefix
+    ++current;
+    if (current == long_name.end()) {
+      return -EINVAL;
+    } else if (*current == 'd') {
+      name.append("DIR_");
+      ++current;
+    } else if (*current == '.') {
+      name.append(".");
+      ++current;
+    } else {
+      --current;  // some other escape: leave it for append_unescaped
+    }
+  }
+  // Field 1: escaped object name.
+  string::const_iterator end = current;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &name))
+    return -EINVAL;
+  // Field 2: escaped key.
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &key))
+    return -EINVAL;
+  // Field 3: snap ("head", "snapdir" or a hex id).
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  string snap_str(current, end);
+  // Field 4: hex hash; it must be the last field.
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end != long_name.end())
+    return -EINVAL;
+  string hash_str(current, end);
+  if (snap_str == "head")
+    snap = CEPH_NOSNAP;
+  else if (snap_str == "snapdir")
+    snap = CEPH_SNAPDIR;
+  else
+    snap = strtoull(snap_str.c_str(), NULL, 16);
+  if (sscanf(hash_str.c_str(), "%X", &hash) != 1)
+    return -EINVAL;  // a non-hex hash would otherwise leave hash uninitialized
+  int64_t pool = -1;
+  spg_t pg;
+  if (coll().is_pg_prefix(&pg))
+    pool = (int64_t)pg.pgid.pool();
+  (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
+  return 0;
+}
+
+
+/// Parse a long name in the current format:
+/// <name>_<key>_<snap>_<hash>_<ns>_<pool>[_<generation>_<shard>].
+/// HASH_INDEX_TAG and HASH_INDEX_TAG_2 indexes are delegated to their own parsers.
+/// @return 0 on success, -EINVAL if long_name is malformed
+int LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
+{
+  string name;
+  string key;
+  string ns;
+  uint32_t hash;
+  snapid_t snap;
+  uint64_t pool;
+  gen_t generation = ghobject_t::NO_GEN;
+  shard_id_t shard_id = shard_id_t::NO_SHARD;
+
+  if (index_version == HASH_INDEX_TAG)
+    return lfn_parse_object_name_keyless(long_name, out);
+  if (index_version == HASH_INDEX_TAG_2)
+    return lfn_parse_object_name_poolless(long_name, out);
+  if (long_name.empty())
+    return -EINVAL;  // never dereference begin() of an empty string
+  string::const_iterator current = long_name.begin();
+  if (*current == '\\') {
+    ++current;
+    if (current == long_name.end()) {
+      return -EINVAL;
+    } else if (*current == 'd') {
+      name.append("DIR_");
+      ++current;
+    } else if (*current == '.') {
+      name.append(".");
+      ++current;
+    } else {
+      --current;
+    }
+  }
+  // Field 1: escaped object name.
+  string::const_iterator end = current;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &name))
+    return -EINVAL;
+  // Field 2: escaped key.
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &key))
+    return -EINVAL;
+  // Field 3: snap ("head", "snapdir" or a hex id).
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  string snap_str(current, end);
+  // Field 4: hex hash.
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  string hash_str(current, end);
+  // Field 5: escaped namespace.
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  if (end == long_name.end())
+    return -EINVAL;
+  if (!append_unescaped(current, end, &ns))
+    return -EINVAL;
+  // Field 6: pool ("none" or a hex id); may be the last field.
+  current = ++end;
+  for ( ; end != long_name.end() && *end != '_'; ++end) ;
+  string pstring(current, end);
+  // Optional generation/shard_id
+  string genstring, shardstring;
+  if (end != long_name.end()) {
+    current = ++end;
+    for ( ; end != long_name.end() && *end != '_'; ++end) ;
+    if (end == long_name.end())
+      return -EINVAL;
+    genstring = string(current, end);
+    generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
+    current = ++end;
+    for ( ; end != long_name.end() && *end != '_'; ++end) ;
+    if (end != long_name.end())
+      return -EINVAL;
+    shardstring = string(current, end);
+    shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16);
+  }
+  if (snap_str == "head")
+    snap = CEPH_NOSNAP;
+  else if (snap_str == "snapdir")
+    snap = CEPH_SNAPDIR;
+  else
+    snap = strtoull(snap_str.c_str(), NULL, 16);
+  if (sscanf(hash_str.c_str(), "%X", &hash) != 1)
+    return -EINVAL;  // a non-hex hash would otherwise leave hash uninitialized
+  if (pstring == "none")
+    pool = (uint64_t)-1;
+  else
+    pool = strtoull(pstring.c_str(), NULL, 16);
+
+  (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
+  return 0;
+}
+
+/// A name is treated as hashed when it is at least FILENAME_SHORT_LEN long and
+/// ends with FILENAME_COOKIE.
+bool LFNIndex::lfn_is_hashed_filename(const string &name)
+{
+  if (name.size() < (unsigned)FILENAME_SHORT_LEN)
+    return false;
+
+  const size_t cookie_len = FILENAME_COOKIE.size();
+  const size_t cookie_pos = name.size() - cookie_len;
+  // compare the tail of name against the cookie without building a substring
+  return name.compare(cookie_pos, cookie_len, FILENAME_COOKIE) == 0;
+}
+
+bool LFNIndex::lfn_must_hash(const string &long_name)
+{ // true for names of FILENAME_SHORT_LEN characters or more
+  return !((int)long_name.size() < FILENAME_SHORT_LEN);
+}
+
+static inline void buf_to_hex(const unsigned char *buf, int len, char *str)
+{ // writes 2*len lowercase hex digits followed by a terminating NUL into str
+  static const char digits[] = "0123456789abcdef";
+  char *w = str;
+  for (int k = 0; k < len; ++k, w += 2)
+    w[0] = digits[buf[k] >> 4], w[1] = digits[buf[k] & 0xf];
+  *w = '\0';
+}
+
+/// Store the first FILENAME_HASH_LEN hex digits of SHA1(filename) in hash.
+/// @return 0 on success, -EINVAL if buf_len cannot hold the digits plus NUL
+int LFNIndex::hash_filename(const char *filename, char *hash, int buf_len)
+{
+  if (buf_len <= FILENAME_HASH_LEN)
+    return -EINVAL;
+
+  unsigned char digest[FILENAME_LFN_DIGEST_SIZE];
+  SHA1 sha;
+  sha.Update(reinterpret_cast<const unsigned char *>(filename), strlen(filename));
+  sha.Final(digest);
+  char hex[FILENAME_LFN_DIGEST_SIZE * 2];
+  buf_to_hex(digest, (FILENAME_HASH_LEN + 1) / 2, hex);
+  strncpy(hash, hex, FILENAME_HASH_LEN);
+  hash[FILENAME_HASH_LEN] = '\0';
+  return 0;
+}
+
+/// Build in filename the short name for old_filename at collision index i.
+/// Names that fit within FILENAME_PREFIX_LEN are copied unchanged; longer ones
+/// become "<prefix>_<hash>_<i>_<cookie>", the prefix shrinking until it fits.
+void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len)
+{
+  char hash[FILENAME_HASH_LEN + 1];
+  ceph_assert(len >= FILENAME_SHORT_LEN + 4);
+
+  strncpy(filename, old_filename, FILENAME_PREFIX_LEN);
+  filename[FILENAME_PREFIX_LEN] = '\0';
+  if ((int)strlen(filename) < FILENAME_PREFIX_LEN ||
+      old_filename[FILENAME_PREFIX_LEN] == '\0')
+    return;
+
+  hash_filename(old_filename, hash, sizeof(hash));
+  for (int ofs = FILENAME_PREFIX_LEN; ; ofs--) {
+    // snprintf is bounded by len, so an oversized suffix cannot overrun filename
+    int suffix_len = snprintf(filename + ofs, len - ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str());
+    if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs)
+      break;
+  }
+}
+
+/// True if short_name is the hashed filename that cand_long_name would get at
+/// the collision index embedded in short_name.
+bool LFNIndex::short_name_matches(const char *short_name, const char *cand_long_name)
+{
+  const char *end = short_name + strlen(short_name);
+  // Step back over the last two '_'-separated fields: "_<index>_<cookie>".
+  const char *suffix = end;
+  for (int fields = 0; fields < 2; ++fields) {
+    if (suffix > short_name)
+      --suffix;
+    while (suffix > short_name && *suffix != '_')
+      --suffix;
+  }
+  int index = -1;
+  char buf[FILENAME_SHORT_LEN + 4];
+  ceph_assert((end - suffix) < (int)sizeof(buf));
+  if (sscanf(suffix, "_%d_%s", &index, buf) < 2 || strcmp(buf, FILENAME_COOKIE.c_str()) != 0)
+    return false;
+  build_filename(cand_long_name, index, buf, sizeof(buf));
+  return strcmp(short_name, buf) == 0;
+}
+
+string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
+{ // hashed filename for oid at collision index i; oid's long name must need hashing
+  char short_name[FILENAME_SHORT_LEN + 4];
+  const string long_name = lfn_generate_object_name(oid);
+  ceph_assert(lfn_must_hash(long_name));
+  build_filename(long_name.c_str(), i, short_name, sizeof(short_name));
+  return short_name;
+}
+
+const string &LFNIndex::get_base_path()
+{ // path of the index root this object was constructed with
+  return this->base_path;
+}
+
+/// Build "<base path>/<mangled rel[0]>/<mangled rel[1]>/..." for subdirectory rel.
+string LFNIndex::get_full_path_subdir(const vector<string> &rel)
+{
+  string dir_path = get_base_path();
+  for (const string &component : rel) {
+    dir_path += "/";
+    // every component gets the subdir prefix via mangle_path_component
+    dir_path += mangle_path_component(component);
+  }
+  return dir_path;
+}
+
+string LFNIndex::get_full_path(const vector<string> &rel, const string &name)
+{ // "<full path of subdir rel>/<name>"
+  return get_full_path_subdir(rel).append("/").append(name);
+}
+
+string LFNIndex::mangle_path_component(const string &component)
+{ // directory name used for component: SUBDIR_PREFIX followed by component
+  return string(SUBDIR_PREFIX).append(component);
+}
+
+string LFNIndex::demangle_path_component(const string &component)
+{ // drop the first SUBDIR_PREFIX.size() characters of component
+  return component.substr(SUBDIR_PREFIX.size());
+}
+
+int LFNIndex::decompose_full_path(const char *in, vector<string> *out,  // split a full path into subdir components and the final filename
+				  ghobject_t *oid, string *shortname)  // oid may be NULL when the object itself is not needed
+{
+  const char *beginning = in + get_base_path().size();  // assumes in starts with the base path — TODO confirm with callers
+  const char *end = beginning;
+  while (1) {
+    end++;  // step over the separator that end currently points at
+    beginning = end++;  // the component starts here; scanning resumes one character later
+    for ( ; *end != '\0' && *end != '/'; ++end) ;
+    if (*end != '\0') {
+      out->push_back(demangle_path_component(string(beginning, end - beginning)));  // followed by '/': a subdirectory component
+      continue;
+    } else {
+      break;  // reached the terminating NUL: [beginning, end) is the last component
+    }
+  }
+  *shortname = string(beginning, end - beginning);
+  if (oid) {
+    int r = lfn_translate(*out, *shortname, oid);  // recover the object from its on-disk name
+    if (r < 0)
+      return r;
+  }
+  return 0;
+}
+
+string LFNIndex::mangle_attr_name(const string &attr)
+{ // attribute name as stored: PHASH_ATTR_PREFIX followed by attr
+  return string(PHASH_ATTR_PREFIX).append(attr);
+}
diff --git a/src/os/filestore/LFNIndex.h b/src/os/filestore/LFNIndex.h
new file mode 100644
index 00000000..149ed10f
--- /dev/null
+++ b/src/os/filestore/LFNIndex.h
@@ -0,0 +1,614 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef OS_LFNINDEX_H
+#define OS_LFNINDEX_H
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include <exception>
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/ceph_crypto.h"
+
+#include "CollectionIndex.h"
+
+/**
+ * LFNIndex also encapsulates logic for manipulating
+ * subdirectories of a collection as well as the long filename
+ * logic.
+ *
+ * The protected methods provide machinery for derived classes to
+ * manipulate subdirectories and objects.
+ *
+ * The virtual methods are to be overridden to provide the actual
+ * hashed layout.
+ *
+ * User must call created when an object is created.
+ *
+ * Synchronization: Calling code must ensure that there are no object
+ * creations or deletions during the lifetime of a Path object (except
+ * of an object at that path).
+ *
+ * Unless otherwise noted, methods which return an int return 0 on success
+ * and a negative error code on failure.
+ */
+#define WRAP_RETRY(x) {	/* run x, starting over whenever it throws RetryException */	\
+  bool failed = false;				\
+  int r = 0;					\
+  init_inject_failure();			\
+  while (1) {					\
+    try {					\
+      if (failed) {	/* the previous attempt threw: cleanup() must succeed before retrying */	\
+	r = cleanup();				\
+	ceph_assert(r == 0);				\
+      }						\
+      { x }	/* x sets r and may leave via "goto out" */	\
+      out:					\
+      complete_inject_failure();		\
+      return r;					\
+    } catch (RetryException&) {			\
+      failed = true;				\
+    } catch (...) {	/* any other exception aborts */	\
+      ceph_abort();				\
+    }						\
+  }						\
+  return -1;					\
+  }						\
+
+
+
+class LFNIndex : public CollectionIndex {
+  /// Hash digest output size.
+  static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE;
+  /// Length of filename hash.
+  static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE;
+  /// Max filename size.
+  static const int FILENAME_MAX_LEN = 4096;
+  /// Length of hashed filename.
+  static const int FILENAME_SHORT_LEN = 255;
+  /// Length of hashed filename prefix.
+  static const int FILENAME_PREFIX_LEN;
+  /// Length of hashed filename cookie.
+  static const int FILENAME_EXTRA = 4;
+  /// Lfn cookie value.
+  static const string FILENAME_COOKIE;
+  /// Name of LFN attribute for storing full name.
+  static const string LFN_ATTR;
+  /// Prefix for subdir index attributes.
+  static const string PHASH_ATTR_PREFIX;
+  /// Prefix for index subdirectories.
+  static const string SUBDIR_PREFIX;
+
+  /// Path to Index base.
+  const string base_path;
+
+protected:
+  const uint32_t index_version;
+
+  /// true if retry injection is enabled
+  struct RetryException : public exception {};
+  bool error_injection_enabled;
+  bool error_injection_on;
+  double error_injection_probability;
+  uint64_t last_failure;
+  uint64_t current_failure;
+  void init_inject_failure() {
+    if (error_injection_on) {
+      error_injection_enabled = true;
+      last_failure = current_failure = 0;
+    }
+  }
+  void maybe_inject_failure();
+  void complete_inject_failure() {
+    error_injection_enabled = false;
+  }
+
+private:
+  string lfn_attribute, lfn_alt_attribute;
+  coll_t collection;
+
+public:
+  /// Constructor
+  LFNIndex(
+    CephContext* cct,
+    coll_t collection,
+    const char *base_path, ///< [in] path to Index root
+    uint32_t index_version,
+    double _error_injection_probability=0)
+    : CollectionIndex(cct, collection),
+      base_path(base_path),
+      index_version(index_version),
+      error_injection_enabled(false),
+      error_injection_on(_error_injection_probability != 0),
+      error_injection_probability(_error_injection_probability),
+      last_failure(0), current_failure(0),
+      collection(collection) {
+    if (index_version == HASH_INDEX_TAG) {
+      lfn_attribute = LFN_ATTR;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%d", index_version);
+      lfn_attribute = LFN_ATTR + string(buf);
+      lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
+   }
+  }
+
+  coll_t coll() const override { return collection; }
+
+  /// Virtual destructor
+  ~LFNIndex() override {}
+
+  /// @see CollectionIndex
+  int init() override;
+
+  /// @see CollectionIndex
+  int cleanup() override = 0;
+
+  /// @see CollectionIndex
+  int created(
+    const ghobject_t &oid,
+    const char *path
+    ) override;
+
+  /// @see CollectionIndex
+  int unlink(
+    const ghobject_t &oid
+    ) override;
+
+  /// @see CollectionIndex
+  int lookup(
+    const ghobject_t &oid,
+    IndexedPath *path,
+    int *hardlink
+    ) override;
+
+  /// @see CollectionIndex;
+  int pre_hash_collection(
+      uint32_t pg_num,
+      uint64_t expected_num_objs
+      ) override;
+
+  /// @see CollectionIndex
+  int collection_list_partial(
+    const ghobject_t &start,
+    const ghobject_t &end,
+    int max_count,
+    vector<ghobject_t> *ls,
+    ghobject_t *next
+    ) override;
+
+  virtual int _split(
+    uint32_t match,                             //< [in] value to match
+    uint32_t bits,                              //< [in] bits to check
+    CollectionIndex* dest                       //< [in] destination index
+    ) = 0;
+  virtual int _merge(
+    uint32_t bits,                              //< [in] bits for target
+    CollectionIndex* dest                       //< [in] destination index
+    ) = 0;
+
+  /// @see CollectionIndex
+  int split(
+    uint32_t match,
+    uint32_t bits,
+    CollectionIndex* dest
+    ) override {
+    WRAP_RETRY(
+      r = _split(match, bits, dest);
+      goto out;
+      );
+  }
+
+  /// @see CollectionIndex
+  int merge(
+    uint32_t bits,
+    CollectionIndex* dest
+    ) override {
+    WRAP_RETRY(
+      r = _merge(bits, dest);
+      goto out;
+      );
+  }
+
+  /**
+   * Returns the length of the longest escaped name which could result
+   * from any clone, shard, or rollback object of this object
+   */
+  static uint64_t get_max_escaped_name_len(const hobject_t &obj);
+
+protected:
+  virtual int _init() = 0;
+
+  /// Will be called upon object creation
+  virtual int _created(
+    const vector<string> &path, ///< [in] Path to subdir.
+    const ghobject_t &oid,      ///< [in] Object created.
+    const string &mangled_name  ///< [in] Mangled filename.
+    ) = 0;
+
+  /// Will be called to remove an object
+  virtual int _remove(
+    const vector<string> &path,     ///< [in] Path to subdir.
+    const ghobject_t &oid,          ///< [in] Object to remove.
+    const string &mangled_name	    ///< [in] Mangled filename.
+    ) = 0;
+
+  /// Return the path and mangled_name for oid.
+  virtual int _lookup(
+    const ghobject_t &oid,///< [in] Object for lookup.
+    vector<string> *path, ///< [out] Path to the object.
+    string *mangled_name, ///< [out] Mangled filename.
+    int *exists		  ///< [out] True if the object exists.
+    ) = 0;
+
+  /// Pre-hash the collection with the given pg number and
+  /// expected number of objects in the collection.
+  virtual int _pre_hash_collection(
+      uint32_t pg_num,
+      uint64_t expected_num_objs
+      ) = 0;
+
+  /// @see CollectionIndex
+  virtual int _collection_list_partial(
+    const ghobject_t &start,
+    const ghobject_t &end,
+    int max_count,
+    vector<ghobject_t> *ls,
+    ghobject_t *next
+    ) = 0;
+
+protected:
+
+  /* Non-virtual utility methods */
+
+  /// Sync a subdirectory
+  int fsync_dir(
+    const vector<string> &path ///< [in] Path to sync
+    ); ///< @return Error Code, 0 on success
+
+  /// Link an object from from into to
+  int link_object(
+    const vector<string> &from,   ///< [in] Source subdirectory.
+    const vector<string> &to,     ///< [in] Dest subdirectory.
+    const ghobject_t &oid,        ///< [in] Object to move.
+    const string &from_short_name ///< [in] Mangled filename of oid.
+    ); ///< @return Error Code, 0 on success
+
+  /**
+   * Efficiently remove objects from a subdirectory
+   *
+   * remove_object invalidates mangled names in the directory requiring
+   * the mangled name of each additional object to be looked up a second
+   * time.  remove_objects removes the need for additional lookups
+   *
+   * @param [in] dir Directory from which to remove.
+   * @param [in] map of objects to remove to mangle names
+   * @param [in,out] map of filenames to objects
+   * @return Error Code, 0 on success.
+   */
+  int remove_objects(
+    const vector<string> &dir,
+    const map<string, ghobject_t> &to_remove,
+    map<string, ghobject_t> *remaining
+    );
+
+
+  /**
+   * Moves contents of from into to.
+   *
+   * Invalidates mangled names in to.  If interrupted, all objects will be
+   * present in to before objects are removed from from.  Ignores EEXIST
+   * while linking into to.
+   * @return Error Code, 0 on success
+   */
+  int move_objects(
+    const vector<string> &from, ///< [in] Source subdirectory.
+    const vector<string> &to    ///< [in] Dest subdirectory.
+    );
+
+  /**
+   * Remove an object from from.
+   *
+   * Invalidates mangled names in from.
+   * @return Error Code, 0 on success
+   */
+  int remove_object(
+    const vector<string> &from,  ///< [in] Directory from which to remove.
+    const ghobject_t &to_remove   ///< [in] Object to remove.
+    );
+
+  /**
+   * Gets the filename corresponding to oid in from.
+   *
+   * The filename may differ between subdirectories.  Furthermore,
+   * file creations or removals in from may invalidate the name.
+   * @return Error code on failure, 0 on success
+   */
+  int get_mangled_name(
+    const vector<string> &from, ///< [in] Subdirectory
+    const ghobject_t &oid,	///< [in] Object
+    string *mangled_name,	///< [out] Filename
+    int *hardlink		///< [out] hardlink for this file, hardlink=0 mean no-exist
+    );
+
+  /// do move subdir from from to dest
+  static int move_subdir(
+    LFNIndex &from,             ///< [in] from index
+    LFNIndex &dest,             ///< [in] to index
+    const vector<string> &path, ///< [in] path containing dir
+    string dir                  ///< [in] dir to move
+    );
+
+  /// do move object from from to dest
+  static int move_object(
+    LFNIndex &from,             ///< [in] from index
+    LFNIndex &dest,             ///< [in] to index
+    const vector<string> &path, ///< [in] path to split
+    const pair<string, ghobject_t> &obj ///< [in] obj to move
+    );
+
+  /**
+   * Lists objects in to_list.
+   *
+   * @param [in] to_list Directory to list.
+   * @param [in] max_objects Max number to list.
+   * @param [in,out] handle Cookie for continuing the listing.
+   * Initialize to zero to start at the beginning of the directory.
+   * @param [out] out Mapping of listed object filenames to objects.
+   * @return Error code on failure, 0 on success
+   */
+  int list_objects(
+    const vector<string> &to_list,
+    int max_objects,
+    long *handle,
+    map<string, ghobject_t> *out
+    );
+
+  /// Lists subdirectories.
+  int list_subdirs(
+    const vector<string> &to_list, ///< [in] Directory to list.
+    vector<string> *out		   ///< [out] Subdirectories listed.
+    );
+
+  /// Create subdirectory.
+  int create_path(
+    const vector<string> &to_create ///< [in] Subdirectory to create.
+    );
+
+  /// Remove subdirectory.
+  int remove_path(
+    const vector<string> &to_remove ///< [in] Subdirectory to remove.
+    );
+
+  /// Check whether to_check exists.
+  int path_exists(
+    const vector<string> &to_check, ///< [in] Subdirectory to check.
+    int *exists			    ///< [out] 1 if it exists, 0 else
+    );
+
+  /// Save attr_value to attr_name attribute on path.
+  int add_attr_path(
+    const vector<string> &path, ///< [in] Path to modify.
+    const string &attr_name, 	///< [in] Name of attribute.
+    bufferlist &attr_value	///< [in] Value to save.
+    );
+
+  /// Read into attr_value attribute attr_name on path.
+  int get_attr_path(
+    const vector<string> &path, ///< [in] Path to read.
+    const string &attr_name, 	///< [in] Attribute to read.
+    bufferlist &attr_value	///< [out] Attribute value read.
+    );
+
+  /// Remove attr from path
+  int remove_attr_path(
+    const vector<string> &path, ///< [in] path from which to remove attr
+    const string &attr_name	///< [in] attr to remove
+    ); ///< @return Error code, 0 on success
+
+private:
+  /* lfn translation functions */
+
+  /**
+   * Gets the version specific lfn attribute tag
+   */
+  const string &get_lfn_attr() const {
+    return lfn_attribute;
+  }
+  const string &get_alt_lfn_attr() const {
+    return lfn_alt_attribute;
+  }
+
+  /**
+   * Gets the filename corresponding to oid in path.
+   *
+   * @param [in] path Path in which to get filename for oid.
+   * @param [in] oid Object for which to get filename.
+   * @param [out] mangled_name Filename for oid, pass NULL if not needed.
+   * @param [out] full_path Fullpath for oid, pass NULL if not needed.
+   * @param [out] hardlink of this file, 0 mean no-exist, pass NULL if
+   * not needed
+   * @return Error Code, 0 on success.
+   */
+  int lfn_get_name(
+    const vector<string> &path,
+    const ghobject_t &oid,
+    string *mangled_name,
+    string *full_path,
+    int *hardlink
+    );
+
+  /// Adjusts path contents when oid is created at name mangled_name.
+  int lfn_created(
+    const vector<string> &path, ///< [in] Path to adjust.
+    const ghobject_t &oid,	///< [in] Object created.
+    const string &mangled_name  ///< [in] Filename of created object.
+    );
+
+  /// Removes oid from path while adjusting path contents
+  int lfn_unlink(
+    const vector<string> &path, ///< [in] Path containing oid.
+    const ghobject_t &oid,	///< [in] Object to remove.
+    const string &mangled_name	///< [in] Filename of object to remove.
+    );
+
+  /// Translate a file into a ghobject_t.
+  int lfn_translate(
+    const vector<string> &path, ///< [in] Path containing the file.
+    const string &short_name,	///< [in] Filename to translate.
+    ghobject_t *out		///< [out] Object found.
+    ); ///< @return Negative error code on error, 0 if not an object, 1 else
+
+  /* manglers/demanglers */
+  /// Filters object filenames
+  bool lfn_is_object(
+    const string &short_name ///< [in] Filename to check
+    ); ///< True if short_name is an object, false otherwise
+
+  /// Filters subdir filenames
+  bool lfn_is_subdir(
+    const string &short_name, ///< [in] Filename to check.
+    string *demangled_name    ///< [out] Demangled subdir name.
+    ); ///< @return True if short_name is a subdir, false otherwise
+
+  /// Generate object name
+  string lfn_generate_object_name_keyless(
+    const ghobject_t &oid ///< [in] Object for which to generate.
+    ); ///< @return Generated object name.
+
+  /// Generate object name
+  string lfn_generate_object_name_poolless(
+    const ghobject_t &oid ///< [in] Object for which to generate.
+    ); ///< @return Generated object name.
+
+  /// Generate object name
+  static string lfn_generate_object_name_current(
+    const ghobject_t &oid ///< [in] Object for which to generate.
+    ); ///< @return Generated object name.
+
+  /// Generate the object name for oid in the format selected by index_version.
+  string lfn_generate_object_name(
+    const ghobject_t &oid ///< [in] Object for which to generate.
+    ) {
+    // HASH_INDEX_TAG selects the keyless generator and HASH_INDEX_TAG_2 the
+    // poolless one; every other version uses the current generator.
+    if (index_version == HASH_INDEX_TAG) {
+      return lfn_generate_object_name_keyless(oid);
+    } else if (index_version == HASH_INDEX_TAG_2) {
+      return lfn_generate_object_name_poolless(oid);
+    }
+    return lfn_generate_object_name_current(oid);
+  } ///< @return Generated object name.
+
+  /// Parse object name
+  int lfn_parse_object_name_keyless(
+    const string &long_name, ///< [in] Name to parse
+    ghobject_t *out	     ///< [out] Resulting Object
+    ); ///< @return NOTE(review): declared int, not bool; presumably 0 on success and an error code otherwise -- confirm against the definition.
+
+  /// Parse object name
+  int lfn_parse_object_name_poolless(
+    const string &long_name, ///< [in] Name to parse
+    ghobject_t *out	     ///< [out] Resulting Object
+    ); ///< @return NOTE(review): declared int, not bool; presumably 0 on success and an error code otherwise -- confirm against the definition.
+
+  /// Parse object name
+  int lfn_parse_object_name(
+    const string &long_name, ///< [in] Name to parse
+    ghobject_t *out	     ///< [out] Resulting Object
+    ); ///< @return NOTE(review): declared int, not bool; presumably 0 on success and an error code otherwise -- confirm against the definition.
+
+  /// Checks whether short_name is a hashed filename.
+  bool lfn_is_hashed_filename(
+    const string &short_name ///< [in] Name to check.
+    ); ///< @return True if short_name is hashed, False otherwise.
+
+  /// Checks whether long_name must be hashed.
+  bool lfn_must_hash(
+    const string &long_name ///< [in] Name to check.
+    ); ///< @return True if long_name must be hashed, False otherwise.
+
+  /// Generate hashed name.
+  string lfn_get_short_name(
+    const ghobject_t &oid, ///< [in] Object for which to generate.
+    int i		   ///< [in] Index of hashed name to generate.
+    ); ///< @return Hashed filename.
+
+  /* other common methods */
+  /// Gets the base path
+  const string &get_base_path(); ///< @return Index base_path
+
+  /// Get full path to the subdir
+  string get_full_path_subdir(
+    const vector<string> &rel ///< [in] The subdir.
+    ); ///< @return Full path to rel.
+
+  /// Get full path to object
+  string get_full_path(
+    const vector<string> &rel, ///< [in] Path to object.
+    const string &name	       ///< [in] Filename of object.
+    ); ///< @return Fullpath to object at name in rel.
+
+  /// Get mangled path component
+  string mangle_path_component(
+    const string &component ///< [in] Component to mangle
+    ); ///< @return Mangled component
+
+  /// Demangle component
+  string demangle_path_component(
+    const string &component ///< [in] Subdir name to demangle
+    ); ///< @return Demangled path component.
+
+  /// Decompose full path into object name and filename.
+  int decompose_full_path(
+    const char *in,      ///< [in] Full path to object.
+    vector<string> *out, ///< [out] Path to object at in.
+    ghobject_t *oid,	 ///< [out] Object at in.
+    string *shortname	 ///< [out] Filename of object at in.
+    ); ///< @return Error Code, 0 on success.
+
+  /// Mangle attribute name
+  string mangle_attr_name(
+    const string &attr ///< [in] Attribute to mangle.
+    ); ///< @return Mangled attribute name.
+
+  /// Checks whether cand_long_name could hash to short_name
+  bool short_name_matches(
+    const char *short_name,    ///< [in] name to check against
+    const char *cand_long_name ///< [in] candidate long name
+    ); ///< @return True if cand_long_name could hash to short_name, false otherwise.
+
+  /// Builds hashed filename
+  void build_filename(
+    const char *old_filename, ///< [in] Filename to convert.
+    int i,		      ///< [in] Index of hash.
+    char *filename,	      ///< [out] Resulting filename.
+    int len		      ///< [in] Size of buffer for filename
+    ); ///< No return value (declared void); the result is delivered through filename.
+
+  /// Get hash of filename
+  int hash_filename(
+    const char *filename, ///< [in] Filename to hash.
+    char *hash,		  ///< [out] Hash of filename.
+    int len		  ///< [in] Size of hash buffer.
+    ); ///< @return Error Code, 0 on success.
+
+  friend class TestWrapLFNIndex;
+};
+typedef LFNIndex::IndexedPath IndexedPath;
+
+#endif
diff --git a/src/os/filestore/SequencerPosition.h b/src/os/filestore/SequencerPosition.h
new file mode 100644
index 00000000..164112ee
--- /dev/null
+++ b/src/os/filestore/SequencerPosition.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OS_SEQUENCERPOSITION_H
+#define __CEPH_OS_SEQUENCERPOSITION_H
+
+#include "include/types.h"
+#include "include/cmp.h"
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+#include <ostream>
+
+/**
+ * transaction and op offset within a sequence number
+ */
+struct SequencerPosition {
+  uint64_t seq;   ///< seq
+  uint32_t trans; ///< transaction in that seq (0-based)
+  uint32_t op;    ///< op in that transaction (0-based)
+
+  /// Build a position; every field defaults to 0.  The offset parameters use
+  /// the same unsigned type as the members they initialize, so values are not
+  /// passed through a signed intermediate.
+  SequencerPosition(uint64_t s=0, uint32_t t=0, uint32_t o=0) : seq(s), trans(t), op(o) {}
+
+  /// Append seq, trans and op (in that order) to bl, as struct version 1.
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(seq, bl);
+    encode(trans, bl);
+    encode(op, bl);
+    ENCODE_FINISH(bl);
+  }
+  /// Read seq, trans and op back from p, in the order encode() wrote them.
+  void decode(bufferlist::const_iterator& p) {
+    DECODE_START(1, p);
+    decode(seq, p);
+    decode(trans, p);
+    decode(op, p);
+    DECODE_FINISH(p);
+  }
+  /// Emit the three fields to f as unsigned values named seq, trans and op.
+  void dump(Formatter *f) const {
+    f->dump_unsigned("seq", seq);
+    f->dump_unsigned("trans", trans);
+    f->dump_unsigned("op", op);
+  }
+  /// Append three heap-allocated sample instances to o.
+  static void generate_test_instances(list<SequencerPosition*>& o) {
+    o.push_back(new SequencerPosition);
+    o.push_back(new SequencerPosition(1, 2, 3));
+    o.push_back(new SequencerPosition(4, 5, 6));
+  }
+};
+WRITE_CLASS_ENCODER(SequencerPosition)
+
+/// Print a SequencerPosition as "seq.trans.op".
+inline ostream& operator<<(ostream& out, const SequencerPosition& t) {
+  out << t.seq;
+  out << "." << t.trans;
+  out << "." << t.op;
+  return out;
+}
+
+// Comparison operators over the fields (seq, trans, op), produced by helper
+// macros (presumably defined in include/cmp.h -- confirm).
+WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op)
+WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op)
+
+
+#endif
diff --git a/src/os/filestore/WBThrottle.cc b/src/os/filestore/WBThrottle.cc
new file mode 100644
index 00000000..ba2ed131
--- /dev/null
+++ b/src/os/filestore/WBThrottle.cc
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+
+#include "os/filestore/WBThrottle.h"
+#include "common/perf_counters.h"
+#include "common/errno.h"
+
+/**
+ * Construct a throttle in the stopped state with fs defaulting to XFS.
+ *
+ * Loads the limits from the configuration, builds and registers the
+ * "WBThrottle" perf counters (all reset to 0) and registers this object as
+ * a config observer.
+ *
+ * @param cct Ceph context; must not be NULL.
+ */
+WBThrottle::WBThrottle(CephContext *cct) :
+  cur_ios(0), cur_size(0),
+  cct(cct),
+  logger(NULL),
+  stopping(true),
+  lock("WBThrottle::lock", false, true, false),
+  fs(XFS)
+{
+  // Validate cct first: set_from_conf() below already dereferences it.
+  ceph_assert(cct);
+  {
+    // set_from_conf() asserts that lock is held
+    Mutex::Locker l(lock);
+    set_from_conf();
+  }
+  PerfCountersBuilder b(
+    cct, string("WBThrottle"),
+    l_wbthrottle_first, l_wbthrottle_last);
+  b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data", NULL, 0, unit_t(UNIT_BYTES));
+  b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data", NULL, 0, unit_t(UNIT_BYTES));
+  b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations");
+  b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations");
+  b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write");
+  b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries");
+  logger = b.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+  // zero every counter strictly between the first/last sentinels
+  for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i)
+    logger->set(i, 0);
+
+  cct->_conf.add_observer(this);
+}
+
+/// Unregister and free the perf counters, then stop observing config changes.
+WBThrottle::~WBThrottle()
+{
+  ceph_assert(cct);
+  PerfCounters *const counters = logger;
+  cct->get_perfcounters_collection()->remove(counters);
+  delete counters;
+  cct->_conf.remove_observer(this);
+}
+
+/// Clear the stopping flag under lock, then launch the "wb_throttle" thread.
+void WBThrottle::start()
+{
+  lock.Lock();
+  stopping = false;
+  lock.Unlock();
+
+  create("wb_throttle");
+}
+
+/// Set the stopping flag and signal cond under lock, then join the thread.
+void WBThrottle::stop()
+{
+  lock.Lock();
+  stopping = true;
+  cond.Signal();
+  lock.Unlock();
+
+  join();
+}
+
+/// Return the NULL-terminated list of config keys this observer tracks.
+const char** WBThrottle::get_tracked_conf_keys() const
+{
+  static const char *tracked_keys[] = {
+    // btrfs limits
+    "filestore_wbthrottle_btrfs_bytes_start_flusher",
+    "filestore_wbthrottle_btrfs_bytes_hard_limit",
+    "filestore_wbthrottle_btrfs_ios_start_flusher",
+    "filestore_wbthrottle_btrfs_ios_hard_limit",
+    "filestore_wbthrottle_btrfs_inodes_start_flusher",
+    "filestore_wbthrottle_btrfs_inodes_hard_limit",
+    // xfs limits
+    "filestore_wbthrottle_xfs_bytes_start_flusher",
+    "filestore_wbthrottle_xfs_bytes_hard_limit",
+    "filestore_wbthrottle_xfs_ios_start_flusher",
+    "filestore_wbthrottle_xfs_ios_hard_limit",
+    "filestore_wbthrottle_xfs_inodes_start_flusher",
+    "filestore_wbthrottle_xfs_inodes_hard_limit",
+    NULL
+  };
+  return tracked_keys;
+}
+
+/**
+ * Reload the size/io/fd limits from the config options matching fs and
+ * signal cond.  Caller must hold lock; aborts if fs is neither BTRFS nor XFS.
+ */
+void WBThrottle::set_from_conf()
+{
+  ceph_assert(lock.is_locked());
+  switch (fs) {
+  case BTRFS:
+    size_limits.first =
+      cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher;
+    size_limits.second =
+      cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit;
+    io_limits.first =
+      cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher;
+    io_limits.second =
+      cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit;
+    fd_limits.first =
+      cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher;
+    fd_limits.second =
+      cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit;
+    break;
+  case XFS:
+    size_limits.first =
+      cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher;
+    size_limits.second =
+      cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit;
+    io_limits.first =
+      cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher;
+    io_limits.second =
+      cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit;
+    fd_limits.first =
+      cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher;
+    fd_limits.second =
+      cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit;
+    break;
+  default:
+    ceph_abort_msg("invalid value for fs");
+  }
+  // wake waiters so they re-check against the new limits
+  cond.Signal();
+}
+
+/// Config observer hook: reload the limits once if any tracked key changed.
+void WBThrottle::handle_conf_change(const ConfigProxy& conf,
+                                    const std::set<std::string> &changed)
+{
+  Mutex::Locker l(lock);
+  // advance to the first tracked key present in `changed` (or the NULL end)
+  const char **key = get_tracked_conf_keys();
+  while (*key && !changed.count(*key))
+    ++key;
+  if (*key)
+    set_from_conf();
+}
+
+/**
+ * Wait until there is something to flush, then hand it out.
+ *
+ * Waits on cond (caller must hold lock) until we are stopping or the
+ * start-flusher limit is reached with pending work.  Pops the front of the
+ * lru list, copies its (oid, fd, pending counts) into *next and removes it
+ * from pending_wbs.
+ *
+ * @param [out] next Receives the object to flush; must not be NULL.
+ * @return false if we are stopping, true if *next was filled in.
+ */
+bool WBThrottle::get_next_should_flush(
+  boost::tuple<ghobject_t, FDRef, PendingWB> *next)
+{
+  ceph_assert(lock.is_locked());
+  ceph_assert(next);
+  while (!stopping && (!beyond_limit() || pending_wbs.empty()))
+    cond.Wait(lock);
+  if (stopping)
+    return false;
+  ceph_assert(!pending_wbs.empty());
+  ghobject_t obj(pop_object());
+
+  ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+    pending_wbs.find(obj);
+  // an object taken from the lru list must have a pending_wbs entry; fail
+  // loudly instead of dereferencing end() if that invariant is ever broken
+  ceph_assert(i != pending_wbs.end());
+  *next = boost::make_tuple(obj, i->second.second, i->second.first);
+  pending_wbs.erase(i);
+  return true;
+}
+
+
+/**
+ * Flusher thread body.
+ *
+ * Holds lock except while syncing.  Repeatedly takes the next object from
+ * get_next_should_flush(), moves its pending ios/bytes from the "dirtied"
+ * to the "wb" perf counters and syncs its fd with the lock dropped.
+ *
+ * @return 0 once get_next_should_flush() returns false (we are stopping).
+ */
+void *WBThrottle::entry()
+{
+  Mutex::Locker l(lock);
+  boost::tuple<ghobject_t, FDRef, PendingWB> wb;
+  while (get_next_should_flush(&wb)) {
+    // publish which object is being flushed; clear_object() waits while
+    // clearing equals the oid it was asked to clear
+    clearing = wb.get<0>();
+    cur_ios -= wb.get<2>().ios;
+    logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
+    logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios);
+    cur_size -= wb.get<2>().size;
+    logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
+    logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size);
+    logger->dec(l_wbthrottle_inodes_dirtied);
+    logger->inc(l_wbthrottle_inodes_wb);
+    // drop the lock for the duration of the sync
+    lock.Unlock();
+#if defined(HAVE_FDATASYNC)
+    int r = ::fdatasync(**wb.get<1>());
+#else
+    int r = ::fsync(**wb.get<1>());
+#endif
+    if (r < 0) {
+      // a failed sync is treated as fatal
+      lderr(cct) << "WBThrottle fsync failed: " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+#ifdef HAVE_POSIX_FADVISE
+    // only when fadvise is enabled and every queued write asked for nocache
+    if (cct->_conf->filestore_fadvise && wb.get<2>().nocache) {
+      int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
+      ceph_assert(fa_r == 0);
+    }
+#endif
+    lock.Lock();
+    // nothing is being flushed any more; wake anyone waiting on cond
+    clearing = ghobject_t();
+    cond.Signal();
+    // reset wb to a default-constructed tuple before waiting for more work
+    wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
+  }
+  return 0;
+}
+
+/**
+ * Record a write of len bytes to hoid as pending writeback.
+ *
+ * Creates the pending entry on first sight of hoid (keeping fd), otherwise
+ * takes hoid out of the lru list; in both cases hoid is then appended to the
+ * back of the lru list.  Signals cond if the start-flusher limit is reached.
+ * The offset argument is not used.
+ */
+void WBThrottle::queue_wb(
+  FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len,
+  bool nocache)
+{
+  Mutex::Locker l(lock);
+  typedef ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> > wb_map_t;
+
+  wb_map_t::iterator slot = pending_wbs.find(hoid);
+  if (slot != pending_wbs.end()) {
+    // already pending: drop its old lru position, it is re-queued below
+    remove_object(hoid);
+  } else {
+    slot = pending_wbs.insert(
+      make_pair(hoid, make_pair(PendingWB(), fd))).first;
+    logger->inc(l_wbthrottle_inodes_dirtied);
+  }
+
+  cur_ios++;
+  logger->inc(l_wbthrottle_ios_dirtied);
+  cur_size += len;
+  logger->inc(l_wbthrottle_bytes_dirtied, len);
+
+  slot->second.first.add(nocache, len, 1);
+  insert_object(hoid);
+  if (beyond_limit())
+    cond.Signal();
+}
+
+/**
+ * Forget all pending writeback: reset the running totals and the "dirtied"
+ * perf counters, empty the pending map and the lru structures, and signal
+ * cond.  Where posix_fadvise is available, entries whose writes were all
+ * nocache first get POSIX_FADV_DONTNEED applied (if filestore_fadvise is on).
+ */
+void WBThrottle::clear()
+{
+  Mutex::Locker l(lock);
+#ifdef HAVE_POSIX_FADVISE
+  typedef ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator wb_iter_t;
+  for (wb_iter_t wb = pending_wbs.begin(); wb != pending_wbs.end(); ++wb) {
+    if (!cct->_conf->filestore_fadvise || !wb->second.first.nocache)
+      continue;
+    int fa_r = posix_fadvise(**wb->second.second, 0, 0, POSIX_FADV_DONTNEED);
+    ceph_assert(fa_r == 0);
+  }
+#endif
+
+  cur_ios = 0;
+  cur_size = 0;
+  logger->set(l_wbthrottle_ios_dirtied, 0);
+  logger->set(l_wbthrottle_bytes_dirtied, 0);
+  logger->set(l_wbthrottle_inodes_dirtied, 0);
+  pending_wbs.clear();
+  lru.clear();
+  rev_lru.clear();
+  cond.Signal();
+}
+
+/**
+ * Drop any pending writeback for hoid.
+ *
+ * Waits while hoid is the object currently being flushed, then, if hoid has
+ * a pending entry, subtracts its counts from the totals and perf counters,
+ * removes it from the pending map and the lru list, and signals cond.
+ */
+void WBThrottle::clear_object(const ghobject_t &hoid)
+{
+  Mutex::Locker l(lock);
+  while (clearing == hoid)
+    cond.Wait(lock);
+
+  auto found = pending_wbs.find(hoid);
+  if (found != pending_wbs.end()) {
+    const PendingWB &pending = found->second.first;
+    cur_ios -= pending.ios;
+    logger->dec(l_wbthrottle_ios_dirtied, pending.ios);
+    cur_size -= pending.size;
+    logger->dec(l_wbthrottle_bytes_dirtied, pending.size);
+    logger->dec(l_wbthrottle_inodes_dirtied);
+
+    // `pending` refers into the erased entry and is not used past this point
+    pending_wbs.erase(found);
+    remove_object(hoid);
+    cond.Signal();
+  }
+}
+
+/// Block on cond until we are stopping or need_flush() no longer holds.
+void WBThrottle::throttle()
+{
+  Mutex::Locker l(lock);
+  for (;;) {
+    if (stopping || !need_flush())
+      return;
+    cond.Wait(lock);
+  }
+}
diff --git a/src/os/filestore/WBThrottle.h b/src/os/filestore/WBThrottle.h
new file mode 100644
index 00000000..ef809ea4
--- /dev/null
+++ b/src/os/filestore/WBThrottle.h
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef WBTHROTTLE_H
+#define WBTHROTTLE_H
+
+#include "include/unordered_map.h"
+#include <boost/tuple/tuple.hpp>
+#include "common/Formatter.h"
+#include "common/hobject.h"
+#include "include/interval_set.h"
+#include "FDCache.h"
+#include "common/Thread.h"
+#include "common/ceph_context.h"
+
+class PerfCounters;
+/// Perf counter indices for WBThrottle.  The _first/_last entries are the
+/// bounds handed to the PerfCountersBuilder, not counters themselves.
+enum {
+  l_wbthrottle_first = 999090,
+  l_wbthrottle_bytes_dirtied,   ///< "Dirty data"
+  l_wbthrottle_bytes_wb,        ///< "Written data"
+  l_wbthrottle_ios_dirtied,     ///< "Dirty operations"
+  l_wbthrottle_ios_wb,          ///< "Written operations"
+  l_wbthrottle_inodes_dirtied,  ///< "Entries waiting for write"
+  l_wbthrottle_inodes_wb,       ///< "Written entries"
+  l_wbthrottle_last
+};
+
+/**
+ * WBThrottle
+ *
+ * Tracks, throttles, and flushes outstanding IO
+ */
+class WBThrottle : Thread, public md_config_obs_t {
+  /// Object entry() is syncing right now; clear_object() waits on it
+  ghobject_t clearing;
+
+  /* For each *_limits pair, .first is the start_flusher limit and
+   * .second is the hard limit
+   */
+
+  /// Limits on unflushed bytes
+  pair<uint64_t, uint64_t> size_limits;
+
+  /// Limits on unflushed ios
+  pair<uint64_t, uint64_t> io_limits;
+
+  /// Limits on unflushed objects
+  pair<uint64_t, uint64_t> fd_limits;
+
+  uint64_t cur_ios;  ///< Currently unflushed IOs
+  uint64_t cur_size; ///< Currently unflushed bytes
+
+  /**
+   * PendingWB tracks the ios pending on an object.
+   */
+  class PendingWB {
+  public:
+    bool nocache;
+    uint64_t size;
+    uint64_t ios;
+    PendingWB() : nocache(true), size(0), ios(0) {}
+    /// Accumulate one more batch of writes into this entry.
+    void add(bool _nocache, uint64_t _size, uint64_t _ios) {
+      // stays nocache only while every write so far was nocache
+      nocache = nocache && _nocache;
+      size += _size;
+      ios += _ios;
+    }
+  };
+
+  CephContext *cct;
+  PerfCounters *logger;
+  bool stopping;
+  Mutex lock;
+  Cond cond;
+
+
+  /**
+   * Flush objects in lru order
+   */
+  list<ghobject_t> lru;
+  /// Maps each object in lru to its position in that list
+  ceph::unordered_map<ghobject_t, list<ghobject_t>::iterator> rev_lru;
+
+  /// Take oid out of the lru structures if it is there (lock must be held)
+  void remove_object(const ghobject_t &oid) {
+    ceph_assert(lock.is_locked());
+    auto pos = rev_lru.find(oid);
+    if (pos != rev_lru.end()) {
+      lru.erase(pos->second);
+      rev_lru.erase(pos);
+    }
+  }
+  /// Remove and return the object at the front of the lru list
+  ghobject_t pop_object() {
+    ceph_assert(!lru.empty());
+    ghobject_t front = lru.front();
+    lru.pop_front();
+    rev_lru.erase(front);
+    return front;
+  }
+  /// Append oid, which must not already be tracked, to the back of the lru list
+  void insert_object(const ghobject_t &oid) {
+    ceph_assert(rev_lru.find(oid) == rev_lru.end());
+    lru.push_back(oid);
+    list<ghobject_t>::iterator tail = lru.end();
+    --tail;
+    rev_lru.insert(make_pair(oid, tail));
+  }
+
+  /// Pending writeback per object, together with the fd to sync
+  ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs;
+
+  /// get next flush to perform
+  bool get_next_should_flush(
+    boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
+    ); ///< @return false if we are shutting down
+public:
+  enum FS {
+    BTRFS,
+    XFS
+  };
+
+private:
+  FS fs;
+
+  void set_from_conf();
+
+  /// True once any of ios/objects/bytes has reached its start_flusher limit
+  bool beyond_limit() const {
+    return !(cur_ios < io_limits.first &&
+             pending_wbs.size() < fd_limits.first &&
+             cur_size < size_limits.first);
+  }
+  /// True once any of ios/objects/bytes has reached its hard limit
+  bool need_flush() const {
+    return !(cur_ios < io_limits.second &&
+             pending_wbs.size() < fd_limits.second &&
+             cur_size < size_limits.second);
+  }
+
+public:
+  explicit WBThrottle(CephContext *cct);
+  ~WBThrottle() override;
+
+  void start();
+  void stop();
+  /// Set fs as XFS or BTRFS
+  void set_fs(FS new_fs) {
+    Mutex::Locker l(lock);
+    fs = new_fs;
+    set_from_conf();
+  }
+
+  /// Queue wb on oid, fd taking throttle (does not block)
+  void queue_wb(
+    FDRef fd,              ///< [in] FDRef to oid
+    const ghobject_t &oid, ///< [in] object
+    uint64_t offset,       ///< [in] offset written
+    uint64_t len,          ///< [in] length written
+    bool nocache           ///< [in] try to clear out of cache after write
+    );
+
+  /// Clear all wb (probably due to sync)
+  void clear();
+
+  /// Clear object
+  void clear_object(const ghobject_t &oid);
+
+  /// Block until there is throttle available
+  void throttle();
+
+  /// md_config_obs_t
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string> &changed) override;
+
+  /// Thread
+  void *entry() override;
+};
+
+#endif
diff --git a/src/os/filestore/XfsFileStoreBackend.cc b/src/os/filestore/XfsFileStoreBackend.cc
new file mode 100644
index 00000000..1081d146
--- /dev/null
+++ b/src/os/filestore/XfsFileStoreBackend.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "XfsFileStoreBackend.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/utsname.h>
+
+#include <xfs/xfs.h>
+
+#include "common/errno.h"
+#include "common/linux_version.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") "
+
+/// Create the XFS backend for fs; extsize support starts out disabled.
+XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs)
+  : GenericFileStoreBackend(fs),
+    m_has_extsize(false)
+{
+}
+
+/*
+ * Set the extsize attr on the regular file fd to val.
+ *
+ * Returns 0 on success, and also when nothing is done because the value is
+ * already in place or the file already has extents allocated; otherwise a
+ * negative errno (-EINVAL if fd is not a regular file).
+ *
+ * This would be a free-standing function, but dout_prefix expands to a call
+ * to the protected member get_basedir_path(), so it has to be a member.
+ */
+int XfsFileStoreBackend::set_extsize(int fd, unsigned int val)
+{
+  struct stat st;
+  if (fstat(fd, &st) < 0) {
+    const int err = -errno;
+    dout(0) << "set_extsize: fstat: " << cpp_strerror(err) << dendl;
+    return err;
+  }
+  if (!S_ISREG(st.st_mode)) {
+    dout(0) << "set_extsize: invalid target file type" << dendl;
+    return -EINVAL;
+  }
+
+  struct fsxattr attrs;
+  if (ioctl(fd, XFS_IOC_FSGETXATTR, &attrs) < 0) {
+    const int err = -errno;
+    dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  // nothing to do if the value is already set; and xfs won't change the
+  // extent size once any extents are allocated
+  const bool already_set =
+    (attrs.fsx_xflags & XFS_XFLAG_EXTSIZE) && attrs.fsx_extsize == val;
+  if (already_set || attrs.fsx_nextents != 0)
+    return 0;
+
+  attrs.fsx_xflags |= XFS_XFLAG_EXTSIZE;
+  attrs.fsx_extsize = val;
+  if (ioctl(fd, XFS_IOC_FSSETXATTR, &attrs) < 0) {
+    const int err = -errno;
+    dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  return 0;
+}
+
+/**
+ * Run the generic feature detection, then probe extsize support.
+ *
+ * Creates and immediately unlinks a scratch file "extsize_test" in the
+ * basedir.  If filestore_xfs_extsize is enabled, tries to set an extsize on
+ * it and enables m_has_extsize only when that works and the kernel is 3.5 or
+ * newer.  Failing to set the extsize is not an error.
+ *
+ * @return negative errno if generic detection fails or the scratch file
+ *         cannot be created/unlinked; otherwise a non-negative value.
+ */
+int XfsFileStoreBackend::detect_features()
+{
+  int ret = GenericFileStoreBackend::detect_features();
+  if (ret < 0)
+    return ret;
+
+  // extsize?
+  const int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600);
+  if (fd < 0) {
+    ret = -errno;
+    dout(0) << "detect_feature: failed to create test file for extsize attr: "
+            << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) {
+    ret = -errno;
+    dout(0) << "detect_feature: failed to unlink test file for extsize attr: "
+            << cpp_strerror(ret) << dendl;
+  } else if (!cct()->_conf->filestore_xfs_extsize) {
+    dout(0) << "detect_feature: extsize is disabled by conf" << dendl;
+  } else {
+    ret = set_extsize(fd, 1U << 15); // a few pages
+    if (ret) {
+      ret = 0;
+      dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl;
+    } else {
+      // make sure we have 3.5 or newer, which includes this fix
+      //   aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d
+      // for this set_extsize bug
+      //   http://oss.sgi.com/bugzilla/show_bug.cgi?id=874
+      const int ver = get_linux_version();
+      if (ver != 0 && ver >= KERNEL_VERSION(3, 5, 0)) {
+        dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl;
+        m_has_extsize = true;
+      } else if (ver == 0) {
+        dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl;
+        m_has_extsize = false;
+      } else {
+        dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl;
+        m_has_extsize = false;
+      }
+    }
+  }
+
+  // the scratch file descriptor is closed on every path that opened it
+  TEMP_FAILURE_RETRY(::close(fd));
+  return ret;
+}
+
+/// Apply hint as the extsize of fd; returns -EOPNOTSUPP when extsize support
+/// was not detected.  hint must be below UINT_MAX.
+int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint)
+{
+  if (m_has_extsize) {
+    ceph_assert(hint < UINT_MAX);
+    return set_extsize(fd, hint);
+  }
+  return -EOPNOTSUPP;
+}
diff --git a/src/os/filestore/XfsFileStoreBackend.h b/src/os/filestore/XfsFileStoreBackend.h
new file mode 100644
index 00000000..e8b81f9a
--- /dev/null
+++ b/src/os/filestore/XfsFileStoreBackend.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_XFSFILESTOREBACKEND_H
+#define CEPH_XFSFILESTOREBACKEND_H
+
+#include "GenericFileStoreBackend.h"
+
+#include "include/int_types.h"
+
+/// FileStore backend for XFS: the generic backend plus extsize allocation hints.
+class XfsFileStoreBackend : public GenericFileStoreBackend {
+public:
+  explicit XfsFileStoreBackend(FileStore *fs);
+  ~XfsFileStoreBackend() override {}
+
+  /// Backend name
+  const char *get_name() override {
+    return "xfs";
+  }
+
+  int detect_features() override;
+  int set_alloc_hint(int fd, uint64_t hint) override;
+
+private:
+  bool m_has_extsize; ///< set by detect_features() when extsize is usable
+  int set_extsize(int fd, unsigned int val);
+};
+
+#endif /* CEPH_XFSFILESTOREBACKEND_H */
diff --git a/src/os/filestore/ZFSFileStoreBackend.cc b/src/os/filestore/ZFSFileStoreBackend.cc
new file mode 100644
index 00000000..e85dbd52
--- /dev/null
+++ b/src/os/filestore/ZFSFileStoreBackend.cc
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/ceph_assert.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+
+#include "ZFSFileStoreBackend.h"
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") "
+
+/**
+ * Initialize libzfs and look up the handles for the basedir and current/.
+ * On failure the affected handles stay NULL and a message is logged.
+ */
+ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs)
+  : GenericFileStoreBackend(fs),
+    base_zh(NULL),
+    current_zh(NULL),
+    m_filestore_zfs_snap(cct()->_conf->filestore_zfs_snap)
+{
+  if (zfs.init() < 0) {
+    dout(0) << "ZFSFileStoreBackend: failed to init libzfs" << dendl;
+    return;
+  }
+
+  base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM);
+  if (base_zh) {
+    update_current_zh();
+  } else {
+    dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl;
+  }
+}
+
+/// Close whichever of the basedir and current/ handles were opened.
+ZFSFileStoreBackend::~ZFSFileStoreBackend()
+{
+  ZFS::Handle *const handles[] = { base_zh, current_zh };
+  for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); ++i) {
+    if (handles[i])
+      zfs.close(handles[i]);
+  }
+}
+
+/**
+ * Locate the zfs filesystem backing current/ and store its handle in
+ * current_zh.
+ *
+ * Opens "<base>/current".  If it is mounted at get_current_path() its handle
+ * is kept.  If it is not mounted it is mounted first.  Otherwise the handle
+ * is looked up from the current path and accepted only if it names a
+ * different filesystem than the basedir.
+ *
+ * @return 0 on success, -ENOENT if no suitable filesystem is found, or a
+ *         negative errno if mounting fails.
+ */
+int ZFSFileStoreBackend::update_current_zh()
+{
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+  ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM);
+  if (zh) {
+    char *mnt;
+    if (zfs.is_mounted(zh, &mnt)) {
+      const bool same_path = (get_current_path() == mnt);
+      free(mnt);
+      if (same_path) {
+	current_zh = zh;
+	return 0;
+      }
+    } else {
+      int ret = zfs.mount(zh, NULL, 0);
+      if (ret < 0) {
+	ret = -errno;
+	dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh)
+		<< "' got " << cpp_strerror(ret) << dendl;
+	// release the handle opened above; it used to leak on this path
+	zfs.close(zh);
+	return ret;
+      }
+    }
+    zfs.close(zh);
+  } else {
+    dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl;
+    return -ENOENT;
+  }
+
+  zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM);
+  if (zh) {
+    // strcmp() != 0: current/ lives on a different filesystem than basedir
+    if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) {
+      current_zh = zh;
+      return 0;
+    }
+    zfs.close(zh);
+    dout(0) << "update_current_zh: basedir and current/ on the same filesystem" << dendl;
+  } else {
+    dout(0) << "update_current_zh: current/ not exist" << dendl;
+  }
+  return -ENOENT;
+}
+
+/// Log when there is no zfs handle for current/; always returns 0.
+int ZFSFileStoreBackend::detect_features()
+{
+  if (current_zh == NULL) {
+    dout(0) << "detect_features: null zfs handle for current/" << dendl;
+  }
+  return 0;
+}
+
+/// Checkpoints need filestore_zfs_snap enabled and a handle for current/.
+bool ZFSFileStoreBackend::can_checkpoint()
+{
+  if (!m_filestore_zfs_snap)
+    return false;
+  return current_zh != NULL;
+}
+
+/**
+ * Make sure current/ exists.
+ *
+ * If the path already exists it must be a directory.  If it does not exist,
+ * a "<base>/current" zfs filesystem is created (EEXIST is tolerated) and
+ * current_zh is refreshed.
+ *
+ * @return 0 on success, -ENOTDIR if current/ is not a directory, otherwise a
+ *         negative errno or the result of update_current_zh().
+ */
+int ZFSFileStoreBackend::create_current()
+{
+  struct stat st;
+  if (::stat(get_current_path().c_str(), &st) == 0) {
+    // current/ exists
+    if (S_ISDIR(st.st_mode))
+      return 0;
+    dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+    return -ENOTDIR;
+  }
+  if (errno != ENOENT) {
+    const int err = -errno;
+    dout(0) << "create_current: cannot stat current/ " << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+  if (zfs.create(path, ZFS::TYPE_FILESYSTEM) < 0 && errno != EEXIST) {
+    const int err = -errno;
+    dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  return update_current_zh();
+}
+
+/// Snapshot iteration callback: append the part of the snapshot's name after
+/// '@' to the list<string> passed through data.  Always returns 0.
+static int list_checkpoints_callback(ZFS::Handle *zh, void *data)
+{
+  const string full_name = ZFS::get_name(zh);
+  const size_t at = full_name.find('@');
+  // the name must contain '@' followed by at least one character
+  ceph_assert(at != string::npos && at + 1 != full_name.length());
+  static_cast<list<string> *>(data)->push_back(full_name.substr(at + 1));
+  return 0;
+}
+
+/**
+ * Replace ls with the names of the snapshots of current/.
+ *
+ * @return 0 on success, -EINVAL without a current/ handle, or a negative
+ *         errno if the iteration fails (ls is left untouched in that case).
+ */
+int ZFSFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+  dout(10) << "list_checkpoints:" << dendl;
+  if (!current_zh)
+    return -EINVAL;
+
+  list<string> found;
+  if (zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &found) < 0) {
+    const int err = -errno;
+    dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got" << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  ls.swap(found);
+  return 0;
+}
+
+/**
+ * Snapshot current/ as "<current>@<name>" after syncing the filesystem.
+ *
+ * @param name Snapshot name.
+ * @param cid  If non-NULL, set to 0 on success.
+ * @return 0 on success, -EINVAL without a current/ handle, or a negative
+ *         errno if the sync or the snapshot fails.
+ */
+int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid)
+{
+  dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+  if (!current_zh)
+    return -EINVAL;
+
+  // looks like zfsonlinux doesn't flush dirty data when taking snapshot
+  if (sync_filesystem(get_current_fd()) < 0) {
+    const int err = -errno;
+    dout(0) << "create_checkpoint: sync_filesystem got" << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+  if (zfs.snapshot(path, false) < 0) {
+    const int err = -errno;
+    dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got" << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  if (cid)
+    *cid = 0;
+  return 0;
+}
+
+/**
+ * Roll current/ back to the snapshot "<current>@<name>".
+ *
+ * current/ is unmounted first (an unmount failure is logged and the rollback
+ * is still attempted) and mounted again afterwards if it is not mounted.
+ *
+ * @param name Snapshot name.
+ * @return the rollback result (negative errno on failure), -EINVAL without a
+ *         current/ handle, -ENOENT if the snapshot cannot be opened, or the
+ *         negative errno of a failed remount.
+ */
+int ZFSFileStoreBackend::rollback_to(const string& name)
+{
+  dout(10) << "rollback_to: '" << name << "'" << dendl;
+  if (!current_zh)
+    return -EINVAL;
+
+  // umount current to avoid triggering online rollback deadlock
+  int ret;
+  if (zfs.is_mounted(current_zh, NULL)) {
+    ret = zfs.umount(current_zh, NULL, 0);
+    if (ret < 0) {
+      ret = -errno;
+      dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got" << cpp_strerror(ret) << dendl;
+    }
+  }
+
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+
+  ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT);
+  if (!snap_zh) {
+    dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl;
+    return -ENOENT;
+  }
+
+  ret = zfs.rollback(current_zh, snap_zh, false);
+  if (ret < 0) {
+    ret = -errno;
+    dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got" << cpp_strerror(ret) << dendl;
+  }
+
+  if (!zfs.is_mounted(current_zh, NULL)) {
+    // separate variable: the rollback result in ret must survive a
+    // successful remount
+    int mount_ret = zfs.mount(current_zh, NULL, 0);
+    if (mount_ret < 0) {
+      mount_ret = -errno;
+      dout(0) << "rollback_to: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(mount_ret) << dendl;
+      // release the snapshot handle; it used to leak on this path
+      zfs.close(snap_zh);
+      return mount_ret;
+    }
+  }
+
+  zfs.close(snap_zh);
+  return ret;
+}
+
+/**
+ * Destroy the snapshot called name on current/.
+ *
+ * @return the zfs result on success, -EINVAL without a current/ handle, or a
+ *         negative errno on failure.
+ */
+int ZFSFileStoreBackend::destroy_checkpoint(const string& name)
+{
+  dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+  if (!current_zh)
+    return -EINVAL;
+
+  int ret = zfs.destroy_snaps(current_zh, name.c_str(), true);
+  if (ret >= 0)
+    return ret;
+
+  ret = -errno;
+  dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got" << cpp_strerror(ret) << dendl;
+  return ret;
+}
diff --git a/src/os/filestore/ZFSFileStoreBackend.h b/src/os/filestore/ZFSFileStoreBackend.h
new file mode 100644
index 00000000..b1fa9887
--- /dev/null
+++ b/src/os/filestore/ZFSFileStoreBackend.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ZFSFILESTOREBACKEND_H
+#define CEPH_ZFSFILESTOREBACKEND_H
+
+#ifdef HAVE_LIBZFS
+#include "GenericFileStoreBackend.h"
+#include "os/fs/ZFS.h"
+
+/*
+ * FileStore backend for ZFS, built on top of GenericFileStoreBackend.
+ * Only compiled when HAVE_LIBZFS is defined.
+ *
+ * Checkpoints are addressed as ZFS snapshots named
+ * "<name of current dataset>@<checkpoint name>" (see rollback_to()).
+ */
+class ZFSFileStoreBackend : public GenericFileStoreBackend {
+private:
+  // wrapper object through which all dataset operations are issued
+  ZFS zfs;
+  // NOTE(review): presumably the handle of the dataset backing the store's
+  // base directory -- not established by the code visible here; confirm.
+  ZFS::Handle *base_zh;
+  // handle of the dataset that rollback_to()/destroy_checkpoint() operate
+  // on; both return -EINVAL while it is NULL
+  ZFS::Handle *current_zh;
+  // NOTE(review): looks like a cached config switch for snapshot use --
+  // confirm against the constructor.
+  bool m_filestore_zfs_snap;
+  int update_current_zh();
+public:
+  explicit ZFSFileStoreBackend(FileStore *fs);
+  ~ZFSFileStoreBackend();
+  // backend identifier string
+  const char *get_name() override {
+    return "zfs";
+  }
+  int detect_features();
+  bool can_checkpoint();
+  int create_current();
+  int list_checkpoints(list<string>& ls);
+  int create_checkpoint(const string& name, uint64_t *cid);
+  // roll the current dataset back to snapshot <name>; 0 or negative errno
+  int rollback_to(const string& name);
+  // destroy snapshot <name> of the current dataset; 0 or negative errno
+  int destroy_checkpoint(const string& name);
+};
+#endif
+#endif
diff --git a/src/os/filestore/chain_xattr.cc b/src/os/filestore/chain_xattr.cc
new file mode 100644
index 00000000..e4dedd29
--- /dev/null
+++ b/src/os/filestore/chain_xattr.cc
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "chain_xattr.h"
+#include <errno.h>           // for ERANGE, ENODATA, ENOMEM
+#include <stdio.h>           // for size_t, snprintf
+#include <stdlib.h>          // for free, malloc
+#include <string.h>          // for strcpy, strlen
+#include "include/ceph_assert.h"  // for assert
+#include "include/buffer.h"
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include "include/ceph_assert.h"
+
+/*
+ * chaining xattrs
+ *
+ * In order to support xattrs that are larger than the xattr size limit that some file systems
+ * impose, we use multiple xattrs to store the value of a single xattr. The xattrs keys
+ * are set as follows:
+ * The first xattr in the chain has a key that holds the original xattr name, with any '@' char
+ * being escaped ("@@").
+ * The chained keys consist of the first xattr's key (with the escaping) and a suffix: "@<id>",
+ * where <id> is the index of the xattr in the chain.
+ */
+
+/*
+ * Build the on-disk key for chunk <i> of the chained xattr <name>.
+ *
+ * Every '@' in <name> is doubled; for i != 0 the suffix "@<i>" is
+ * appended.  The result is written to raw_name, which holds raw_len
+ * bytes; running out of room trips an assertion.
+ */
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
+{
+  int used = 0;
+
+  for (const char *src = name; *src; ++src) {
+    if (*src == '@') {
+      /* escape it */
+      used += 2;
+      ceph_assert(used < raw_len - 1);
+      *raw_name++ = '@';
+      *raw_name++ = '@';
+    } else {
+      ++used;
+      ceph_assert(used < raw_len - 1);
+      *raw_name++ = *src;
+    }
+  }
+
+  if (i == 0) {
+    // first chunk: the escaped name alone is the key
+    *raw_name = '\0';
+    return;
+  }
+
+  const int room = raw_len - used;
+  int r = snprintf(raw_name, room, "@%d", i);
+  ceph_assert(r < room);
+}
+
+/*
+ * Undo the key escaping done by get_raw_xattr_name().
+ *
+ * Copies raw_name into name (capacity name_len bytes, always
+ * NUL-terminated), collapsing each "@@" into a single '@'.  An '@'
+ * followed by any other character marks a chain suffix: translation
+ * stops there and *is_first is set to false; otherwise *is_first is
+ * true.  A lone '@' at the very end of raw_name is malformed and simply
+ * ends the translation, so the scan never steps past the terminating
+ * NUL of raw_name.
+ *
+ * Returns the length of the translated name.
+ */
+static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first)
+{
+  int pos = 0;
+
+  *is_first = true;
+  while (*raw_name) {
+    if (*raw_name == '@') {
+      raw_name++;
+      if (!*raw_name)
+        break;                  /* dangling '@': stop at the terminator */
+      if (*raw_name != '@') {
+        *is_first = false;      /* "@<id>" suffix of a chained chunk */
+        break;
+      }
+      /* "@@": fall through and emit a single '@' */
+    }
+    *name = *raw_name;
+    pos++;
+    ceph_assert(pos < name_len);
+    name++;
+    raw_name++;
+  }
+  *name = '\0';
+  return pos;
+}
+
+
+// getxattr
+
+/*
+ * Total stored length of the chained xattr <name> on path <fn>.
+ *
+ * Sums the sizes of successive chunks until one is missing or is not a
+ * full block.  A failure on the very first chunk is returned as is
+ * (negative errno); a failure on a later chunk just ends the walk.
+ */
+static int getxattr_len(const char *fn, const char *name)
+{
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int sum = 0;
+
+  for (int idx = 0; ; ++idx) {
+    get_raw_xattr_name(name, idx, raw_name, sizeof(raw_name));
+    const int len = sys_getxattr(fn, raw_name, 0, 0);
+    if (len < 0) {
+      if (idx == 0)
+        return len;
+      break;
+    }
+    sum += len;
+    // only a completely filled block can be followed by another chunk
+    if (len != CHAIN_XATTR_MAX_BLOCK_LEN &&
+        len != CHAIN_XATTR_SHORT_BLOCK_LEN)
+      break;
+  }
+
+  return sum;
+}
+
+/*
+ * Read the chained xattr <name> of path <fn> into val (capacity <size>).
+ *
+ * With size == 0 nothing is read and the total stored length is returned
+ * (see getxattr_len()).  Otherwise the chunks are read back to back into
+ * val and the number of bytes read is returned.
+ *
+ * Returns a negative errno from sys_getxattr() on failure, and -ERANGE
+ * when the buffer was filled but a further chunk exists.
+ */
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+  int i = 0, pos = 0;
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int ret = 0;
+  int r;
+  size_t chunk_size;
+
+  if (!size)
+    return getxattr_len(fn, name);
+
+  do {
+    // offer all of the remaining buffer space to this chunk
+    chunk_size = size;
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+    r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
+    // a missing chunk after the first one simply ends the chain
+    if (i && r == -ENODATA) {
+      ret = pos;
+      break;
+    }
+    if (r < 0) {
+      ret = r;
+      break;
+    }
+
+    if (r > 0) {
+      pos += r;
+      size -= r;
+    }
+
+    i++;
+    // keep going only while space is left and the last chunk was a full block
+  } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+		    r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+  if (r >= 0) {
+    ret = pos;
+    /* is there another chunk? that can happen if the last read size span over
+       exactly one block */
+    if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+	chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+      // size-only probe of the next chunk
+      r = sys_getxattr(fn, raw_name, 0, 0);
+      if (r > 0) { // there's another chunk.. the original buffer was too small
+        ret = -ERANGE;
+      }
+    }
+  }
+  return ret;
+}
+
+/*
+ * Read the chained xattr <name> of <fn> into a freshly allocated buffer.
+ *
+ * Starts with a 1024 byte buffer and doubles it for as long as
+ * chain_getxattr() answers -ERANGE.  On a positive result the buffer is
+ * trimmed to the value length and, if bp is non-NULL, swapped into *bp.
+ *
+ * Returns the value length, 0 for an empty value (*bp is left alone), or
+ * a negative errno.
+ */
+int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp)
+{
+  for (size_t capacity = 1024; ; capacity *= 2) {
+    bufferptr scratch(capacity);
+    const int got = chain_getxattr(fn, name, scratch.c_str(), capacity);
+
+    if (got == -ERANGE)
+      continue;         // too small, retry with twice the space
+    if (got <= 0)
+      return got;       // empty value or hard error
+
+    scratch.set_length(got);
+    if (bp)
+      bp->swap(scratch);
+    return got;
+  }
+  ceph_abort_msg("unreachable");
+  return 0;
+}
+
+/*
+ * Total stored length of the chained xattr <name> on descriptor <fd>.
+ *
+ * Sums the sizes of successive chunks until one is missing or is not a
+ * full block.  A failure on the very first chunk is returned as is
+ * (negative errno); a failure on a later chunk just ends the walk.
+ */
+static int chain_fgetxattr_len(int fd, const char *name)
+{
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int sum = 0;
+
+  for (int idx = 0; ; ++idx) {
+    get_raw_xattr_name(name, idx, raw_name, sizeof(raw_name));
+    const int len = sys_fgetxattr(fd, raw_name, 0, 0);
+    if (len < 0) {
+      if (idx == 0)
+        return len;
+      break;
+    }
+    sum += len;
+    // only a completely filled block can be followed by another chunk
+    if (len != CHAIN_XATTR_MAX_BLOCK_LEN &&
+        len != CHAIN_XATTR_SHORT_BLOCK_LEN)
+      break;
+  }
+
+  return sum;
+}
+
+/*
+ * Read the chained xattr <name> of descriptor <fd> into val (capacity
+ * <size>).
+ *
+ * With size == 0 nothing is read and the total stored length is returned
+ * (see chain_fgetxattr_len()).  Otherwise the chunks are read back to
+ * back into val and the number of bytes read is returned.
+ *
+ * Returns a negative errno from sys_fgetxattr() on failure, and -ERANGE
+ * when the buffer was filled but a further chunk exists.
+ */
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+  int i = 0, pos = 0;
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int ret = 0;
+  int r;
+  size_t chunk_size;
+
+  if (!size)
+    return chain_fgetxattr_len(fd, name);
+
+  do {
+    // offer all of the remaining buffer space to this chunk
+    chunk_size = size;
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+    r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+    // a missing chunk after the first one simply ends the chain
+    if (i && r == -ENODATA) {
+      ret = pos;
+      break;
+    }
+    if (r < 0) {
+      ret = r;
+      break;
+    }
+
+    if (r > 0) {
+      pos += r;
+      size -= r;
+    }
+
+    i++;
+    // keep going only while space is left and the last chunk was a full block
+  } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+		    r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+  if (r >= 0) {
+    ret = pos;
+    /* is there another chunk? that can happen if the last read size span over
+       exactly one block */
+    if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+	chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+      // size-only probe of the next chunk
+      r = sys_fgetxattr(fd, raw_name, 0, 0);
+      if (r > 0) { // there's another chunk.. the original buffer was too small
+        ret = -ERANGE;
+      }
+    }
+  }
+  return ret;
+}
+
+
+// setxattr
+
+/*
+ * Pick the chunk length used to stripe a value of <size> bytes.
+ *
+ * Values up to CHAIN_XATTR_SHORT_LEN_THRESHOLD may fit in the inode, so
+ * they are striped over short attrs so that XFS won't kick them out;
+ * anything larger uses the maximum block length.
+ */
+int get_xattr_block_size(size_t size)
+{
+  const bool may_fit_inline = (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD);
+  return may_fit_inline ? CHAIN_XATTR_SHORT_BLOCK_LEN
+                        : CHAIN_XATTR_MAX_BLOCK_LEN;
+}
+
+// removexattr
+
+/*
+ * Remove every chunk of the chained xattr <name> from path <fn>.
+ *
+ * Chunks are removed in index order until a removal fails.  A failure on
+ * the first chunk is returned (negative errno); a failure on any later
+ * chunk ends the walk and counts as success.
+ */
+int chain_removexattr(const char *fn, const char *name)
+{
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+
+  for (int idx = 0; ; ++idx) {
+    get_raw_xattr_name(name, idx, raw_name, sizeof(raw_name));
+    const int rc = sys_removexattr(fn, raw_name);
+    if (rc < 0)
+      return idx == 0 ? rc : 0;
+  }
+}
+
+/*
+ * Remove every chunk of the chained xattr <name> from descriptor <fd>.
+ *
+ * Chunks are removed in index order until a removal fails.  A failure on
+ * the first chunk is returned (negative errno); a failure on any later
+ * chunk ends the walk and counts as success.
+ */
+int chain_fremovexattr(int fd, const char *name)
+{
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+
+  for (int idx = 0; ; ++idx) {
+    get_raw_xattr_name(name, idx, raw_name, sizeof(raw_name));
+    const int rc = sys_fremovexattr(fd, raw_name);
+    if (rc < 0)
+      return idx == 0 ? rc : 0;
+  }
+}
+
+
+// listxattr
+
+/*
+ * List the logical (un-chained, un-escaped) xattr names of path <fn>.
+ *
+ * With len == 0 nothing is written and twice the raw listing size is
+ * returned as a buffer-size estimate.  Otherwise each first-in-chain
+ * name is written to names as a NUL-terminated string, one after the
+ * other, and the number of bytes written is returned.
+ *
+ * Returns a negative errno on failure: whatever sys_listxattr() reports,
+ * -ENOMEM if the scratch buffer cannot be allocated, or -ERANGE if names
+ * is too small to hold a name together with its terminating NUL.
+ */
+int chain_listxattr(const char *fn, char *names, size_t len) {
+  int r;
+
+  if (!len) {
+    // size probe: scale only a real size, never an error code
+    r = sys_listxattr(fn, names, len);
+    return r < 0 ? r : r * 2;
+  }
+
+  r = sys_listxattr(fn, 0, 0);
+  if (r <= 0)
+    return r;   // error, or nothing to list (also avoids malloc(0))
+
+  size_t total_len = r * 2; // should be enough
+  char *full_buf = (char *)malloc(total_len);
+  if (!full_buf)
+    return -ENOMEM;
+
+  r = sys_listxattr(fn, full_buf, total_len);
+  if (r >= 0) {
+    const char *p = full_buf;
+    const char *end = full_buf + r;
+    char *dest = names;
+    char *dest_end = names + len;
+
+    while (p < end) {
+      char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+      int attr_len = strlen(p);
+      bool is_first;
+      int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+      if (is_first) {
+        // strcpy() writes name_len + 1 bytes: the name plus its NUL
+        if ((size_t)(dest_end - dest) <= (size_t)name_len) {
+          r = -ERANGE;
+          break;
+        }
+        strcpy(dest, name);
+        dest += name_len + 1;
+      }
+      p += attr_len + 1;
+    }
+    if (r >= 0)
+      r = dest - names;
+  }
+
+  free(full_buf);
+  return r;
+}
+
+/*
+ * List the logical (un-chained, un-escaped) xattr names of descriptor
+ * <fd>.
+ *
+ * With len == 0 nothing is written and twice the raw listing size is
+ * returned as a buffer-size estimate.  Otherwise each first-in-chain
+ * name is written to names as a NUL-terminated string, one after the
+ * other, and the number of bytes written is returned.
+ *
+ * Returns a negative errno on failure: whatever sys_flistxattr()
+ * reports, -ENOMEM if the scratch buffer cannot be allocated, or -ERANGE
+ * if names is too small to hold a name together with its terminating NUL.
+ */
+int chain_flistxattr(int fd, char *names, size_t len) {
+  int r;
+
+  if (!len) {
+    // size probe: scale only a real size, never an error code
+    r = sys_flistxattr(fd, names, len);
+    return r < 0 ? r : r * 2;
+  }
+
+  r = sys_flistxattr(fd, 0, 0);
+  if (r <= 0)
+    return r;   // error, or nothing to list (also avoids malloc(0))
+
+  size_t total_len = r * 2; // should be enough
+  char *full_buf = (char *)malloc(total_len);
+  if (!full_buf)
+    return -ENOMEM;
+
+  r = sys_flistxattr(fd, full_buf, total_len);
+  if (r >= 0) {
+    const char *p = full_buf;
+    const char *end = full_buf + r;
+    char *dest = names;
+    char *dest_end = names + len;
+
+    while (p < end) {
+      char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+      int attr_len = strlen(p);
+      bool is_first;
+      int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+      if (is_first) {
+        // strcpy() writes name_len + 1 bytes: the name plus its NUL
+        if ((size_t)(dest_end - dest) <= (size_t)name_len) {
+          r = -ERANGE;
+          break;
+        }
+        strcpy(dest, name);
+        dest += name_len + 1;
+      }
+      p += attr_len + 1;
+    }
+    if (r >= 0)
+      r = dest - names;
+  }
+
+  free(full_buf);
+  return r;
+}
diff --git a/src/os/filestore/chain_xattr.h b/src/os/filestore/chain_xattr.h
new file mode 100644
index 00000000..a2d17fa6
--- /dev/null
+++ b/src/os/filestore/chain_xattr.h
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OSD_CHAIN_XATTR_H
+#define __CEPH_OSD_CHAIN_XATTR_H
+
+#include "include/compat.h"
+#include <errno.h>
+#include <stdio.h>
+#include "common/xattr.h"
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+
+#if defined(__linux__)
+#include <linux/limits.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2)
+#elif defined(__APPLE__)
+#include <sys/xattr.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2)
+#else
+#define CHAIN_XATTR_MAX_NAME_LEN  128
+#endif
+
+#define CHAIN_XATTR_MAX_BLOCK_LEN 2048
+
+/*
+ * XFS will only inline xattrs < 255 bytes, so for xattrs that are
+ * likely to fit in the inode, stripe over short xattrs.
+ */
+#define CHAIN_XATTR_SHORT_BLOCK_LEN 250
+#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000
+
+// wrappers to hide annoying errno handling.
+
+// fgetxattr wrapper: result of ceph_os_fgetxattr(), or -errno on failure.
+static inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+  const int rc = ::ceph_os_fgetxattr(fd, name, val, size);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+// getxattr wrapper: result of ceph_os_getxattr(), or -errno on failure.
+static inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+  const int rc = ::ceph_os_getxattr(fn, name, val, size);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+// setxattr wrapper: result of ceph_os_setxattr(), or -errno on failure.
+static inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size)
+{
+  const int rc = ::ceph_os_setxattr(fn, name, val, size);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+// fsetxattr wrapper: result of ceph_os_fsetxattr(), or -errno on failure.
+static inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size)
+{
+  const int rc = ::ceph_os_fsetxattr(fd, name, val, size);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+// listxattr wrapper: result of ceph_os_listxattr(), or -errno on failure.
+static inline int sys_listxattr(const char *fn, char *names, size_t len)
+{
+  const int rc = ::ceph_os_listxattr(fn, names, len);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+// flistxattr wrapper: result of ceph_os_flistxattr(), or -errno on failure.
+static inline int sys_flistxattr(int fd, char *names, size_t len)
+{
+  const int rc = ::ceph_os_flistxattr(fd, names, len);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+// removexattr wrapper: result of ceph_os_removexattr(), or -errno on failure.
+static inline int sys_removexattr(const char *fn, const char *name)
+{
+  const int rc = ::ceph_os_removexattr(fn, name);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+// fremovexattr wrapper: result of ceph_os_fremovexattr(), or -errno on failure.
+static inline int sys_fremovexattr(int fd, const char *name)
+{
+  const int rc = ::ceph_os_fremovexattr(fd, name);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+
+// wrappers to chain large values across multiple xattrs
+
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size);
+int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp);
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size);
+
+int get_xattr_block_size(size_t size);
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len);
+
+/*
+ * Store <size> bytes from val as the chained xattr <name> on path <fn>.
+ *
+ * The value is split into chunks of get_xattr_block_size(size) bytes
+ * (or kept in one attr when ensure_single_attr is set) and written under
+ * the keys produced by get_raw_xattr_name().  Unless skip_chain_cleanup
+ * is set, chunks left behind by a previous, longer value are removed
+ * afterwards; the cleanup stops at the first chunk that cannot be
+ * removed, so an error other than -ENODATA cannot make it loop forever.
+ *
+ * Returns the number of bytes written, or a negative errno from the
+ * failing set/remove call.
+ */
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_setxattr(
+  const char *fn, const char *name, const void *val, size_t size)
+{
+  int i = 0, pos = 0;
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int ret = 0;
+  size_t max_chunk_size =
+    ensure_single_attr ? size : get_xattr_block_size(size);
+
+  static_assert(
+    !skip_chain_cleanup || ensure_single_attr,
+    "skip_chain_cleanup must imply ensure_single_attr");
+
+  do {
+    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    size -= chunk_size;
+
+    int r = sys_setxattr(fn, raw_name, (const char *)val + pos, chunk_size);
+    if (r < 0) {
+      ret = r;
+      break;
+    }
+    pos  += chunk_size;
+    ret = pos;
+    i++;
+    ceph_assert(size == 0 || !ensure_single_attr);
+  } while (size);
+
+  if (ret >= 0 && !skip_chain_cleanup) {
+    int r;
+    do {
+      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+      r = sys_removexattr(fn, raw_name);
+      // -ENODATA is the normal end of the stale chain; anything else is
+      // reported to the caller
+      if (r < 0 && r != -ENODATA)
+        ret = r;
+      i++;
+    } while (r >= 0);
+  }
+
+  return ret;
+}
+
+/*
+ * Store <size> bytes from val as the chained xattr <name> on descriptor
+ * <fd>.
+ *
+ * The value is split into chunks of get_xattr_block_size(size) bytes
+ * (or kept in one attr when ensure_single_attr is set) and written under
+ * the keys produced by get_raw_xattr_name().  Unless skip_chain_cleanup
+ * is set, chunks left behind by a previous, longer value are removed
+ * afterwards; the cleanup stops at the first chunk that cannot be
+ * removed, so an error other than -ENODATA cannot make it loop forever.
+ *
+ * Returns the number of bytes written, or a negative errno from the
+ * failing set/remove call.
+ */
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_fsetxattr(
+  int fd, const char *name, const void *val, size_t size)
+{
+  int i = 0, pos = 0;
+  char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+  int ret = 0;
+  size_t max_chunk_size =
+    ensure_single_attr ? size : get_xattr_block_size(size);
+
+  static_assert(
+    !skip_chain_cleanup || ensure_single_attr,
+    "skip_chain_cleanup must imply ensure_single_attr");
+
+  do {
+    size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+    get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+    size -= chunk_size;
+
+    int r = sys_fsetxattr(fd, raw_name, (const char *)val + pos, chunk_size);
+    if (r < 0) {
+      ret = r;
+      break;
+    }
+    pos  += chunk_size;
+    ret = pos;
+    i++;
+    ceph_assert(size == 0 || !ensure_single_attr);
+  } while (size);
+
+  if (ret >= 0 && !skip_chain_cleanup) {
+    int r;
+    do {
+      get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+      r = sys_fremovexattr(fd, raw_name);
+      // -ENODATA is the normal end of the stale chain; anything else is
+      // reported to the caller
+      if (r < 0 && r != -ENODATA)
+        ret = r;
+      i++;
+    } while (r >= 0);
+  }
+
+  return ret;
+}
+
+int chain_listxattr(const char *fn, char *names, size_t len);
+int chain_flistxattr(int fd, char *names, size_t len);
+int chain_removexattr(const char *fn, const char *name);
+int chain_fremovexattr(int fd, const char *name);
+
+#endif
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
commit	483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree	e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/os/filestore
parent	Initial commit. (diff)
download	ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip