summaryrefslogtreecommitdiffstats
path: root/src/os/filestore/HashIndex.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/os/filestore/HashIndex.cc')
-rw-r--r--src/os/filestore/HashIndex.cc1226
1 files changed, 1226 insertions, 0 deletions
diff --git a/src/os/filestore/HashIndex.cc b/src/os/filestore/HashIndex.cc
new file mode 100644
index 000000000..75c3d1b67
--- /dev/null
+++ b/src/os/filestore/HashIndex.cc
@@ -0,0 +1,1226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+#include <errno.h>
+
+#include "HashIndex.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferptr;
+using ceph::bufferlist;
+
+const string HashIndex::SUBDIR_ATTR = "contents";
+const string HashIndex::SETTINGS_ATTR = "settings";
+const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
+
+/// hex digit to integer value
+int hex_to_int(char c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ if (c >= 'A' && c <= 'F')
+ return c - 'A' + 10;
+ ceph_abort();
+}
+
+/// int value to hex digit
+char int_to_hex(int v)
+{
+ ceph_assert(v < 16);
+ if (v < 10)
+ return '0' + v;
+ return 'A' + v - 10;
+}
+
+/// reverse bits in a nibble (0..15)
+int reverse_nibble_bits(int in)
+{
+ ceph_assert(in < 16);
+ return
+ ((in & 8) >> 3) |
+ ((in & 4) >> 1) |
+ ((in & 2) << 1) |
+ ((in & 1) << 3);
+}
+
+/// reverse nibble bits in a hex digit
+char reverse_hexdigit_bits(char c)
+{
+ return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
+}
+
+/// reverse nibble bits in a hex string
+string reverse_hexdigit_bits_string(string s)
+{
+ for (unsigned i=0; i<s.size(); ++i)
+ s[i] = reverse_hexdigit_bits(s[i]);
+ return s;
+}
+
+/// compare hex digit (as length 1 string) bitwise
+bool cmp_hexdigit_bitwise(const string& l, const string& r)
+{
+ ceph_assert(l.length() == 1 && r.length() == 1);
+ int lv = hex_to_int(l[0]);
+ int rv = hex_to_int(r[0]);
+ ceph_assert(lv < 16);
+ ceph_assert(rv < 16);
+ return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
+}
+
+/// compare hex digit string bitwise
+bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
+{
+ string ll = reverse_hexdigit_bits_string(l);
+ string rr = reverse_hexdigit_bits_string(r);
+ return ll < rr;
+}
+
+int HashIndex::cleanup() {
+ bufferlist bl;
+ int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0) {
+ // No in progress operations!
+ return 0;
+ }
+ auto i = bl.cbegin();
+ InProgressOp in_progress(i);
+ subdir_info_s info;
+ r = get_info(in_progress.path, &info);
+ if (r == -ENOENT) {
+ return end_split_or_merge(in_progress.path);
+ } else if (r < 0) {
+ return r;
+ }
+
+ if (in_progress.is_split())
+ return complete_split(in_progress.path, info);
+ else if (in_progress.is_merge())
+ return complete_merge(in_progress.path, info);
+ else if (in_progress.is_col_split()) {
+ for (vector<string>::iterator i = in_progress.path.begin();
+ i != in_progress.path.end();
+ ++i) {
+ vector<string> path(in_progress.path.begin(), i);
+ int r = reset_attr(path);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+ }
+ else
+ return -EINVAL;
+}
+
+int HashIndex::reset_attr(
+ const vector<string> &path)
+{
+ int exists = 0;
+ int r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists)
+ return 0;
+ map<string, ghobject_t> objects;
+ vector<string> subdirs;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ subdir_info_s info;
+ info.hash_level = path.size();
+ info.objs = objects.size();
+ info.subdirs = subdirs.size();
+ return set_info(path, info);
+}
+
+int HashIndex::col_split_level(
+ HashIndex &from,
+ HashIndex &to,
+ const vector<string> &path,
+ uint32_t inbits,
+ uint32_t match,
+ unsigned *mkdirred)
+{
+ /* For each subdir, move, recurse, or ignore based on comparing the low order
+ * bits of the hash represented by the subdir path with inbits, match passed
+ * in.
+ */
+ vector<string> subdirs;
+ int r = from.list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, ghobject_t> objects;
+ r = from.list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+
+ set<string> to_move;
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ uint32_t bits = 0;
+ uint32_t hash = 0;
+ vector<string> sub_path(path.begin(), path.end());
+ sub_path.push_back(*i);
+ path_to_hobject_hash_prefix(sub_path, &bits, &hash);
+ if (bits < inbits) {
+ if (hobject_t::match_hash(hash, bits, match)) {
+ r = col_split_level(
+ from,
+ to,
+ sub_path,
+ inbits,
+ match,
+ mkdirred);
+ if (r < 0)
+ return r;
+ if (*mkdirred > path.size())
+ *mkdirred = path.size();
+ } // else, skip, doesn't need to be moved or recursed into
+ } else {
+ if (hobject_t::match_hash(hash, inbits, match)) {
+ to_move.insert(*i);
+ }
+ } // else, skip, doesn't need to be moved or recursed into
+ }
+
+ /* Then, do the same for each object */
+ map<string, ghobject_t> objs_to_move;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (i->second.match(inbits, match)) {
+ objs_to_move.insert(*i);
+ }
+ }
+
+ if (objs_to_move.empty() && to_move.empty())
+ return 0;
+
+ // Make parent directories as needed
+ while (*mkdirred < path.size()) {
+ ++*mkdirred;
+ int exists = 0;
+ vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
+ r = to.path_exists(creating_path, &exists);
+ if (r < 0)
+ return r;
+ if (exists)
+ continue;
+ subdir_info_s info;
+ info.objs = 0;
+ info.subdirs = 0;
+ info.hash_level = creating_path.size();
+ if (*mkdirred < path.size() - 1)
+ info.subdirs = 1;
+ r = to.start_col_split(creating_path);
+ if (r < 0)
+ return r;
+ r = to.create_path(creating_path);
+ if (r < 0)
+ return r;
+ r = to.set_info(creating_path, info);
+ if (r < 0)
+ return r;
+ r = to.end_split_or_merge(creating_path);
+ if (r < 0)
+ return r;
+ }
+
+ subdir_info_s from_info;
+ subdir_info_s to_info;
+ r = from.get_info(path, &from_info);
+ if (r < 0)
+ return r;
+ r = to.get_info(path, &to_info);
+ if (r < 0)
+ return r;
+
+ from.start_col_split(path);
+ to.start_col_split(path);
+
+ // Do subdir moves
+ for (set<string>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ from_info.subdirs--;
+ to_info.subdirs++;
+ r = move_subdir(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+ for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
+ i != objs_to_move.end();
+ ++i) {
+ from_info.objs--;
+ to_info.objs++;
+ r = move_object(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+
+ r = to.set_info(path, to_info);
+ if (r < 0)
+ return r;
+ r = from.set_info(path, from_info);
+ if (r < 0)
+ return r;
+ from.end_split_or_merge(path);
+ to.end_split_or_merge(path);
+ return 0;
+}
+
+int HashIndex::_merge(
+ uint32_t bits,
+ CollectionIndex* dest) {
+ dout(20) << __func__ << " bits " << bits << dendl;
+ ceph_assert(collection_version() == dest->collection_version());
+
+ vector<string> emptypath;
+
+ // pre-split to common/target level so that any shared prefix DIR_?
+ // directories already exist at the destination. Since each
+ // directory is a nibble (4 bits),
+ unsigned shared = bits / 4;
+ dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl;
+ if (shared) {
+ split_dirs(emptypath, shared);
+ ((HashIndex*)dest)->split_dirs(emptypath, shared);
+ }
+
+ // now merge the contents
+ _merge_dirs(*this, *(HashIndex*)dest, emptypath);
+
+ return 0;
+}
+
+int HashIndex::_merge_dirs(
+ HashIndex& from,
+ HashIndex& to,
+ const vector<string>& path)
+{
+ dout(20) << __func__ << " path " << path << dendl;
+ int r;
+
+ vector<string> src_subs, dst_subs;
+ r = from.list_subdirs(path, &src_subs);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "from.list_subdirs"
+ << dendl;
+ return r;
+ }
+ r = to.list_subdirs(path, &dst_subs);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "to.list_subdirs"
+ << dendl;
+ return r;
+ }
+
+ for (auto& i : src_subs) {
+ if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) {
+ // move it
+ r = move_subdir(from, to, path, i);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "move_subdir(...,"
+ << path << "," << i << ")"
+ << dendl;
+ return r;
+ }
+ } else {
+ // common, recurse!
+ vector<string> nested = path;
+ nested.push_back(i);
+ r = _merge_dirs(from, to, nested);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "rec _merge_dirs"
+ << dendl;
+ return r;
+ }
+
+ // now remove it
+ r = remove_path(nested);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "remove_path "
+ << nested
+ << dendl;
+ return r;
+ }
+ }
+ }
+
+ // objects
+ map<string, ghobject_t> objects;
+ r = from.list_objects(path, 0, 0, &objects);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "from.list_objects"
+ << dendl;
+ return r;
+ }
+
+ for (auto& i : objects) {
+ r = move_object(from, to, path, i);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "move_object(...,"
+ << path << "," << i << ")"
+ << dendl;
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+
+int HashIndex::_split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest) {
+ ceph_assert(collection_version() == dest->collection_version());
+ unsigned mkdirred = 0;
+
+ return col_split_level(
+ *this,
+ *static_cast<HashIndex*>(dest),
+ vector<string>(),
+ bits,
+ match,
+ &mkdirred);
+}
+
+int HashIndex::split_dirs(const vector<string> &path, int target_level) {
+ dout(20) << __func__ << " " << path << " target level: "
+ << target_level << dendl;
+ subdir_info_s info;
+ int r = get_info(path, &info);
+ if (r < 0) {
+ dout(10) << "error looking up info for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (must_split(info, target_level)) {
+ dout(1) << __func__ << " " << path << " has " << info.objs
+ << " objects, " << info.hash_level
+ << " level, starting split in pg " << coll() << "." << dendl;
+ r = initiate_split(path, info);
+ if (r < 0) {
+ dout(10) << "error initiating split on " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = complete_split(path, info);
+ dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
+ << dendl;
+ if (r < 0) {
+ dout(10) << "error completing split on " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ vector<string> subdirs;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0) {
+ dout(10) << "error listing subdirs of " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ for (vector<string>::const_iterator it = subdirs.begin();
+ it != subdirs.end(); ++it) {
+ vector<string> subdir_path(path);
+ subdir_path.push_back(*it);
+ r = split_dirs(subdir_path, target_level);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return r;
+}
+
+int HashIndex::apply_layout_settings(int target_level) {
+ vector<string> path;
+ dout(10) << __func__ << " split multiple = " << split_multiplier
+ << " merge threshold = " << merge_threshold
+ << " split rand factor = " << cct->_conf->filestore_split_rand_factor
+ << " target level = " << target_level
+ << dendl;
+ int r = write_settings();
+ if (r < 0)
+ return r;
+ return split_dirs(path, target_level);
+}
+
+int HashIndex::_init() {
+ subdir_info_s info;
+ vector<string> path;
+ int r = set_info(path, info);
+ if (r < 0)
+ return r;
+ return write_settings();
+}
+
+int HashIndex::write_settings() {
+ if (cct->_conf->filestore_split_rand_factor > 0) {
+ settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor;
+ } else {
+ settings.split_rand_factor = 0;
+ }
+ vector<string> path;
+ bufferlist bl;
+ settings.encode(bl);
+ return add_attr_path(path, SETTINGS_ATTR, bl);
+}
+
+int HashIndex::read_settings() {
+ vector<string> path;
+ bufferlist bl;
+ int r = get_attr_path(path, SETTINGS_ATTR, bl);
+ if (r == -ENODATA)
+ return 0;
+ if (r < 0) {
+ derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ auto it = bl.cbegin();
+ settings.decode(it);
+ dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
+ return 0;
+}
+
+/* LFNIndex virtual method implementations */
+int HashIndex::_created(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name) {
+ subdir_info_s info;
+ int r;
+ r = get_info(path, &info);
+ if (r < 0)
+ return r;
+ info.objs++;
+ r = set_info(path, info);
+ if (r < 0)
+ return r;
+
+ if (must_split(info)) {
+ dout(1) << __func__ << " " << path << " has " << info.objs
+ << " objects, starting split in pg " << coll() << "." << dendl;
+ int r = initiate_split(path, info);
+ if (r < 0) {
+ derr << __func__ << " error starting split " << path << " in pg "
+ << coll() << ": " << cpp_strerror(r) << dendl;
+ ceph_assert(!cct->_conf->filestore_fail_eio);
+ } else {
+ r = complete_split(path, info);
+ if (r < 0) {
+ derr << __func__ << " error completing split " << path << " in pg "
+ << coll() << ": " << cpp_strerror(r) << dendl;
+ ceph_assert(!cct->_conf->filestore_fail_eio);
+ }
+ dout(1) << __func__ << " " << path << " split completed in pg " << coll()
+ << "." << dendl;
+ }
+ }
+
+ return 0;
+}
+
+int HashIndex::_remove(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name) {
+ int r;
+ r = remove_object(path, oid);
+ if (r < 0)
+ return r;
+ subdir_info_s info;
+ r = get_info(path, &info);
+ if (r < 0)
+ return r;
+ info.objs--;
+ r = set_info(path, info);
+ if (r < 0)
+ return r;
+
+ if (must_merge(info)) {
+ dout(1) << __func__ << " " << path << " has " << info.objs
+ << " objects, starting merge in pg " << coll() << "." << dendl;
+ r = initiate_merge(path, info);
+ if (r < 0) {
+ derr << __func__ << " error starting merge " << path << " in pg "
+ << coll() << ": " << cpp_strerror(r) << dendl;
+ ceph_assert(!cct->_conf->filestore_fail_eio);
+ } else {
+ r = complete_merge(path, info);
+ if (r < 0) {
+ derr << __func__ << " error completing merge " << path << " in pg "
+ << coll() << ": " << cpp_strerror(r) << dendl;
+ ceph_assert(!cct->_conf->filestore_fail_eio);
+ }
+ dout(1) << __func__ << " " << path << " merge completed in pg " << coll()
+ << "." << dendl;
+ }
+ }
+
+ return 0;
+}
+
+int HashIndex::_lookup(const ghobject_t &oid,
+ vector<string> *path,
+ string *mangled_name,
+ int *hardlink) {
+ vector<string> path_comp;
+ get_path_components(oid, &path_comp);
+ vector<string>::iterator next = path_comp.begin();
+ int exists;
+ while (1) {
+ int r = path_exists(*path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists) {
+ if (path->empty())
+ return -ENOENT;
+ path->pop_back();
+ break;
+ }
+ if (next == path_comp.end())
+ break;
+ path->push_back(*(next++));
+ }
+ return get_mangled_name(*path, oid, mangled_name, hardlink);
+}
+
+int HashIndex::_collection_list_partial(const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
+ vector<string> path;
+ ghobject_t _next;
+ if (!next)
+ next = &_next;
+ *next = start;
+ dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
+ return list_by_hash(path, end, max_count, next, ls);
+}
+
+int HashIndex::prep_delete() {
+ return recursive_remove(vector<string>());
+}
+
+int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
+ int ret;
+ vector<string> path;
+ subdir_info_s root_info;
+ // Make sure there is neither objects nor sub-folders
+ // in this collection
+ ret = get_info(path, &root_info);
+ if (ret < 0)
+ return ret;
+
+ // Do the folder splitting first
+ ret = pre_split_folder(pg_num, expected_num_objs);
+ if (ret < 0)
+ return ret;
+ // Initialize the folder info starting from root
+ return init_split_folder(path, 0);
+}
+
+int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
+{
+ // If folder merging is enabled (by setting the threshold positive),
+ // no need to split
+ if (merge_threshold > 0)
+ return 0;
+ const coll_t c = coll();
+ // Do not split if the expected number of objects in this collection is zero (by default)
+ if (expected_num_objs == 0)
+ return 0;
+
+ // Calculate the number of leaf folders (which actually store files)
+ // need to be created
+ const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
+ uint64_t leavies = expected_num_objs / objs_per_folder ;
+ // No need to split
+ if (leavies == 0 || expected_num_objs == objs_per_folder)
+ return 0;
+
+ spg_t spgid;
+ if (!c.is_pg_prefix(&spgid))
+ return -EINVAL;
+ const ps_t ps = spgid.pgid.ps();
+
+ // the most significant bits of pg_num
+ const int pg_num_bits = calc_num_bits(pg_num - 1);
+ ps_t tmp_id = ps;
+ // calculate the number of levels we only create one sub folder
+ int num = pg_num_bits / 4;
+ // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
+ // so that splitting starts at level 3
+ if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
+ --num;
+ }
+
+ int ret;
+ // Start with creation that only has one subfolder
+ vector<string> paths;
+ int dump_num = num;
+ while (num-- > 0) {
+ ps_t v = tmp_id & 0x0000000f;
+ paths.push_back(to_hex(v));
+ ret = create_path(paths);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ tmp_id = tmp_id >> 4;
+ }
+
+ // Starting from here, we can split by creating multiple subfolders
+ const int left_bits = pg_num_bits - dump_num * 4;
+ // this variable denotes how many bits (for this level) that can be
+ // used for sub folder splitting
+ int split_bits = 4 - left_bits;
+ // the below logic is inspired by rados.h#ceph_stable_mod,
+ // it basically determines how many sub-folders should we
+ // create for splitting
+ ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
+ if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
+ ++split_bits;
+ }
+ const uint32_t subs = (1 << split_bits);
+ // Calculate how many levels we create starting from here
+ int level = 0;
+ int level_limit = MAX_HASH_LEVEL - dump_num - 1;
+ uint64_t actual_leaves = subs;
+ while (actual_leaves < leavies && level < level_limit) {
+ ++level;
+ actual_leaves <<= 4;
+ }
+ for (uint32_t i = 0; i < subs; ++i) {
+ ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
+ int v = tmp_id | (i << ((4 - split_bits) % 4));
+ paths.push_back(to_hex(v));
+ ret = create_path(paths);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ ret = recursive_create_path(paths, level);
+ if (ret < 0)
+ return ret;
+ paths.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
+{
+ // Get the number of sub directories for the current path
+ vector<string> subdirs;
+ int ret = list_subdirs(path, &subdirs);
+ if (ret < 0)
+ return ret;
+ subdir_info_s info;
+ info.subdirs = subdirs.size();
+ info.hash_level = hash_level;
+ ret = set_info(path, info);
+ if (ret < 0)
+ return ret;
+ ret = fsync_dir(path);
+ if (ret < 0)
+ return ret;
+
+ // Do the same for subdirs
+ vector<string>::const_iterator iter;
+ for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
+ path.push_back(*iter);
+ ret = init_split_folder(path, hash_level + 1);
+ if (ret < 0)
+ return ret;
+ path.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::recursive_create_path(vector<string>& path, int level)
+{
+ if (level == 0)
+ return 0;
+ for (int i = 0; i < 16; ++i) {
+ path.push_back(to_hex(i));
+ int ret = create_path(path);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ ret = recursive_create_path(path, level - 1);
+ if (ret < 0)
+ return ret;
+ path.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::recursive_remove(const vector<string> &path) {
+ return _recursive_remove(path, true);
+}
+
+int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
+ vector<string> subdirs;
+ dout(20) << __func__ << " path=" << path << dendl;
+ int r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, ghobject_t> objects;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ if (!objects.empty())
+ return -ENOTEMPTY;
+ vector<string> subdir(path);
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ subdir.push_back(*i);
+ r = _recursive_remove(subdir, false);
+ if (r < 0)
+ return r;
+ subdir.pop_back();
+ }
+ if (top)
+ return 0;
+ else
+ return remove_path(path);
+}
+
+int HashIndex::start_col_split(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::start_split(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::SPLIT, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::start_merge(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::MERGE, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::end_split_or_merge(const vector<string> &path) {
+ return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
+}
+
+int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
+ bufferlist buf;
+ int r = get_attr_path(path, SUBDIR_ATTR, buf);
+ if (r < 0)
+ return r;
+ auto bufiter = buf.cbegin();
+ info->decode(bufiter);
+ ceph_assert(path.size() == (unsigned)info->hash_level);
+ return 0;
+}
+
+int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
+ bufferlist buf;
+ ceph_assert(path.size() == (unsigned)info.hash_level);
+ info.encode(buf);
+ return add_attr_path(path, SUBDIR_ATTR, buf);
+}
+
+bool HashIndex::must_merge(const subdir_info_s &info) {
+ return (info.hash_level > 0 &&
+ merge_threshold > 0 &&
+ info.objs < (unsigned)merge_threshold &&
+ info.subdirs == 0);
+}
+
+bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
+ // target_level is used for ceph-objectstore-tool to split dirs offline.
+ // if it is set (defalult is 0) and current hash level < target_level,
+ // this dir would be split no matters how many objects it has.
+ return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
+ ((target_level > 0 && info.hash_level < (unsigned)target_level) ||
+ (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16))));
+}
+
+int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
+ return start_merge(path);
+}
+
+int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
+ vector<string> dst = path;
+ dst.pop_back();
+ subdir_info_s dstinfo;
+ int r, exists;
+ r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ r = get_info(dst, &dstinfo);
+ if (r < 0)
+ return r;
+ if (exists) {
+ r = move_objects(path, dst);
+ if (r < 0)
+ return r;
+ r = reset_attr(dst);
+ if (r < 0)
+ return r;
+ r = remove_path(path);
+ if (r < 0)
+ return r;
+ }
+ if (must_merge(dstinfo)) {
+ r = initiate_merge(dst, dstinfo);
+ if (r < 0)
+ return r;
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return complete_merge(dst, dstinfo);
+ }
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
+ return start_split(path);
+}
+
+int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
+ int level = info.hash_level;
+ map<string, ghobject_t> objects;
+ vector<string> dst = path;
+ int r;
+ dst.push_back("");
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ vector<string> subdirs_vec;
+ r = list_subdirs(path, &subdirs_vec);
+ if (r < 0)
+ return r;
+ set<string> subdirs;
+ subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
+ map<string, map<string, ghobject_t> > mapped;
+ map<string, ghobject_t> moved;
+ int num_moved = 0;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ vector<string> new_path;
+ get_path_components(i->second, &new_path);
+ mapped[new_path[level]][i->first] = i->second;
+ }
+ for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
+ i != mapped.end();
+ ) {
+ dst[level] = i->first;
+ /* If the info already exists, it must be correct,
+ * we may be picking up a partially finished split */
+ subdir_info_s temp;
+ // subdir has already been fully copied
+ if (subdirs.count(i->first) && !get_info(dst, &temp)) {
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ }
+ ++i;
+ continue;
+ }
+
+ subdir_info_s info_new;
+ info_new.objs = i->second.size();
+ info_new.subdirs = 0;
+ info_new.hash_level = level + 1;
+ if (must_merge(info_new) && !subdirs.count(i->first)) {
+ mapped.erase(i++);
+ continue;
+ }
+
+ // Subdir doesn't yet exist
+ if (!subdirs.count(i->first)) {
+ info.subdirs += 1;
+ r = create_path(dst);
+ if (r < 0)
+ return r;
+ } // else subdir has been created but only partially copied
+
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ r = link_object(path, dst, j->second, j->first);
+ // May be a partially finished split
+ if (r < 0 && r != -EEXIST) {
+ return r;
+ }
+ }
+
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+
+ // Presence of info must imply that all objects have been copied
+ r = set_info(dst, info_new);
+ if (r < 0)
+ return r;
+
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+
+ ++i;
+ }
+ r = remove_objects(path, moved, &objects);
+ if (r < 0)
+ return r;
+ info.objs = objects.size();
+ r = reset_attr(path);
+ if (r < 0)
+ return r;
+ r = fsync_dir(path);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+void HashIndex::get_path_components(const ghobject_t &oid,
+ vector<string> *path) {
+ char buf[MAX_HASH_LEVEL + 1];
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
+
+ // Path components are the hex characters of oid.hobj.hash, least
+ // significant first
+ for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
+ path->push_back(string(&buf[i], 1));
+ }
+}
+
+string HashIndex::get_hash_str(uint32_t hash) {
+ char buf[MAX_HASH_LEVEL + 1];
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
+ string retval;
+ for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
+ retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
+ }
+ return retval;
+}
+
+string HashIndex::get_path_str(const ghobject_t &oid) {
+ ceph_assert(!oid.is_max());
+ return get_hash_str(oid.hobj.get_hash());
+}
+
+uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
+ while (prefix.size() < sizeof(uint32_t) * 2) {
+ prefix.push_back('0');
+ }
+ uint32_t hash;
+ sscanf(prefix.c_str(), "%x", &hash);
+ // nibble reverse
+ hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
+ hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
+ hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
+ return hash;
+}
+
+int HashIndex::get_path_contents_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t *next_object,
+ set<string, CmpHexdigitStringBitwise> *hash_prefixes,
+ set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
+{
+ map<string, ghobject_t> rev_objects;
+ int r;
+ r = list_objects(path, 0, 0, &rev_objects);
+ if (r < 0)
+ return r;
+ // bitwise sort
+ for (map<string, ghobject_t>::iterator i = rev_objects.begin();
+ i != rev_objects.end();
+ ++i) {
+ if (next_object && i->second < *next_object)
+ continue;
+ string hash_prefix = get_path_str(i->second);
+ hash_prefixes->insert(hash_prefix);
+ objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
+ }
+ vector<string> subdirs;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ // sort subdirs bitwise (by reversing hex digit nibbles)
+ std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
+
+ // Local to this function, we will convert the prefix strings
+ // (previously simply the reversed hex digits) to also have each
+ // digit's nibbles reversed. This will make the strings sort
+ // bitwise.
+ string cur_prefix;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ cur_prefix.append(reverse_hexdigit_bits_string(*i));
+ }
+ string next_object_string;
+ if (next_object)
+ next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
+ if (next_object) {
+ if (next_object->is_max())
+ continue;
+ if (candidate < next_object_string.substr(0, candidate.size()))
+ continue;
+ }
+ // re-reverse the hex digit nibbles for the caller
+ hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
+ }
+ return 0;
+}
+
+int HashIndex::list_by_hash(const vector<string> &path,
+ const ghobject_t &end,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ ceph_assert(out);
+ return list_by_hash_bitwise(path, end, max_count, next, out);
+}
+
+int HashIndex::list_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t& end,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ vector<string> next_path = path;
+ next_path.push_back("");
+ set<string, CmpHexdigitStringBitwise> hash_prefixes;
+ set<pair<string, ghobject_t>, CmpPairBitwise> objects;
+ int r = get_path_contents_by_hash_bitwise(path,
+ next,
+ &hash_prefixes,
+ &objects);
+ if (r < 0)
+ return r;
+ for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
+ i != hash_prefixes.end();
+ ++i) {
+ dout(20) << __func__ << " prefix " << *i << dendl;
+ set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
+ make_pair(*i, ghobject_t()));
+ if (j == objects.end() || j->first != *i) {
+ *(next_path.rbegin()) = *(i->rbegin());
+ ghobject_t next_recurse;
+ if (next)
+ next_recurse = *next;
+ r = list_by_hash_bitwise(next_path,
+ end,
+ max_count,
+ &next_recurse,
+ out);
+
+ if (r < 0)
+ return r;
+ if (!next_recurse.is_max()) {
+ if (next)
+ *next = next_recurse;
+ return 0;
+ }
+ } else {
+ while (j != objects.end() && j->first == *i) {
+ if (max_count > 0 && out->size() == (unsigned)max_count) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (j->second >= end) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (!next || j->second >= *next) {
+ dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
+ out->push_back(j->second);
+ }
+ ++j;
+ }
+ }
+ }
+ if (next)
+ *next = ghobject_t::get_max();
+ return 0;
+}
+
+