summaryrefslogtreecommitdiffstats
path: root/src/key_value_store
diff options
context:
space:
mode:
Diffstat (limited to 'src/key_value_store')
-rw-r--r--src/key_value_store/CMakeLists.txt7
-rw-r--r--src/key_value_store/cls_kvs.cc690
-rw-r--r--src/key_value_store/key_value_structure.h149
-rw-r--r--src/key_value_store/kv_flat_btree_async.cc2346
-rw-r--r--src/key_value_store/kv_flat_btree_async.h899
-rw-r--r--src/key_value_store/kvs_arg_types.h144
6 files changed, 4235 insertions, 0 deletions
diff --git a/src/key_value_store/CMakeLists.txt b/src/key_value_store/CMakeLists.txt
new file mode 100644
index 00000000..0b17ede1
--- /dev/null
+++ b/src/key_value_store/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(kvs_srcs cls_kvs.cc)
+add_library(cls_kvs SHARED ${kvs_srcs})
+set_target_properties(cls_kvs PROPERTIES
+ VERSION "1.0.0"
+ SOVERSION "1"
+ INSTALL_RPATH "")
+install(TARGETS cls_kvs DESTINATION ${CMAKE_INSTALL_LIBDIR}/rados-classes)
diff --git a/src/key_value_store/cls_kvs.cc b/src/key_value_store/cls_kvs.cc
new file mode 100644
index 00000000..d206e374
--- /dev/null
+++ b/src/key_value_store/cls_kvs.cc
@@ -0,0 +1,690 @@
+/*
+ * OSD classes for the key value store
+ *
+ * Created on: Aug 10, 2012
+ * Author: Eleanor Cawthon
+ */
+
+#include "include/compat.h"
+#include "objclass/objclass.h"
+#include <errno.h>
+#include "key_value_store/kvs_arg_types.h"
+#include "include/types.h"
+#include <iostream>
+#include <climits>
+
+
+/**
+ * finds the index_data where a key belongs.
+ *
+ * @param key: the key to search for
+ * @param idata: the index_data for the first index value such that idata.key
+ * is greater than key.
+ * @param next_idata: the index_data for the next index entry after idata
+ * @pre: key is not encoded
+ * @post: idata contains complete information
+ * stored
+ */
+static int get_idata_from_key(cls_method_context_t hctx, const string &key,
+ index_data &idata, index_data &next_idata) {
+ bufferlist raw_val;
+ int r = 0;
+ std::map<std::string, bufferlist> kvmap;
+
+ bool more;
+
+ r = cls_cxx_map_get_vals(hctx, key_data(key).encoded(), "", 2, &kvmap, &more);
+ if (r < 0) {
+ CLS_LOG(20, "error reading index for range %s: %d", key.c_str(), r);
+ return r;
+ }
+
+ r = cls_cxx_map_get_val(hctx, key_data(key).encoded(), &raw_val);
+ if (r == 0){
+ CLS_LOG(20, "%s is already in the index: %d", key.c_str(), r);
+ auto b = raw_val.cbegin();
+ idata.decode(b);
+ if (!kvmap.empty()) {
+ auto b = kvmap.begin()->second.cbegin();
+ next_idata.decode(b);
+ }
+ return r;
+ } else if (r == -ENOENT || r == -ENODATA) {
+ auto b = kvmap.begin()->second.cbegin();
+ idata.decode(b);
+ if (idata.kdata.prefix != "1") {
+ auto nb = (++kvmap.begin())->second.cbegin();
+ next_idata.decode(nb);
+ }
+ r = 0;
+ } else if (r < 0) {
+ CLS_LOG(20, "error reading index for duplicates %s: %d", key.c_str(), r);
+ return r;
+ }
+
+ CLS_LOG(20, "idata is %s", idata.str().c_str());
+ return r;
+}
+
+
+static int get_idata_from_key_op(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out) {
+ CLS_LOG(20, "get_idata_from_key_op");
+ idata_from_key_args op;
+ auto it = in->cbegin();
+ try {
+ decode(op, it);
+ } catch (buffer::error& err) {
+ CLS_LOG(20, "error decoding idata_from_key_args.");
+ return -EINVAL;
+ }
+ int r = get_idata_from_key(hctx, op.key, op.idata, op.next_idata);
+ if (r < 0) {
+ return r;
+ } else {
+ encode(op, *out);
+ return 0;
+ }
+}
+
+/**
+ * finds the object in the index with the lowest key value that is greater
+ * than idata.key. If idata.key is the max key, returns -EOVERFLOW. If
+ * idata has a prefix and has timed out, cleans up.
+ *
+ * @param idata: idata for the object to search for.
+ * @param out_data: the idata for the next object.
+ *
+ * @pre: idata must contain a key.
+ * @post: out_data contains complete information
+ */
+static int get_next_idata(cls_method_context_t hctx, const index_data &idata,
+ index_data &out_data) {
+ int r = 0;
+ std::map<std::string, bufferlist> kvs;
+ bool more;
+ r = cls_cxx_map_get_vals(hctx, idata.kdata.encoded(), "", 1, &kvs, &more);
+ if (r < 0){
+ CLS_LOG(20, "getting kvs failed with error %d", r);
+ return r;
+ }
+
+ if (!kvs.empty()) {
+ out_data.kdata.parse(kvs.begin()->first);
+ auto b = kvs.begin()->second.cbegin();
+ out_data.decode(b);
+ } else {
+ r = -EOVERFLOW;
+ }
+
+ return r;
+}
+
+static int get_next_idata_op(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out) {
+ CLS_LOG(20, "get_next_idata_op");
+ idata_from_idata_args op;
+ auto it = in->cbegin();
+ try {
+ decode(op, it);
+ } catch (buffer::error& err) {
+ return -EINVAL;
+ }
+ int r = get_next_idata(hctx, op.idata, op.next_idata);
+ if (r < 0) {
+ return r;
+ } else {
+ op.encode(*out);
+ return 0;
+ }
+}
+
+/**
+ * finds the object in the index with the highest key value that is less
+ * than idata.key. If idata.key is the lowest key, returns -ERANGE If
+ * idata has a prefix and has timed out, cleans up.
+ *
+ * @param idata: idata for the object to search for.
+ * @param out_data: the idata for the next object.
+ *
+ * @pre: idata must contain a key.
+ * @ost: out_data contains complete information
+ */
+static int get_prev_idata(cls_method_context_t hctx, const index_data &idata,
+ index_data &out_data) {
+ int r = 0;
+ std::map<std::string, bufferlist> kvs;
+ bool more;
+ r = cls_cxx_map_get_vals(hctx, "", "", LONG_MAX, &kvs, &more);
+ if (r < 0){
+ CLS_LOG(20, "getting kvs failed with error %d", r);
+ return r;
+ }
+
+ std::map<std::string, bufferlist>::iterator it =
+ kvs.lower_bound(idata.kdata.encoded());
+ if (it->first != idata.kdata.encoded()) {
+ CLS_LOG(20, "object %s not found in the index (expected %s, found %s)",
+ idata.str().c_str(), idata.kdata.encoded().c_str(),
+ it->first.c_str());
+ return -ENODATA;
+ }
+ if (it == kvs.begin()) {
+ //it is the first object, there is no previous.
+ return -ERANGE;
+ } else {
+ --it;
+ }
+ out_data.kdata.parse(it->first);
+ auto b = it->second.cbegin();
+ out_data.decode(b);
+
+ return 0;
+}
+
+static int get_prev_idata_op(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out) {
+ CLS_LOG(20, "get_next_idata_op");
+ idata_from_idata_args op;
+ auto it = in->cbegin();
+ try {
+ decode(op, it);
+ } catch (buffer::error& err) {
+ return -EINVAL;
+ }
+ int r = get_prev_idata(hctx, op.idata, op.next_idata);
+ if (r < 0) {
+ return r;
+ } else {
+ op.encode(*out);
+ return 0;
+ }
+}
+
+/**
+ * Read all of the index entries where any keys in the map go
+ */
+static int read_many(cls_method_context_t hctx, const set<string> &keys,
+ map<string, bufferlist> * out) {
+ int r = 0;
+ bool more;
+ CLS_ERR("reading from a map of size %d, first key encoded is %s",
+ (int)keys.size(), key_data(*keys.begin()).encoded().c_str());
+ r = cls_cxx_map_get_vals(hctx, key_data(*keys.begin()).encoded().c_str(),
+ "", LONG_MAX, out, &more);
+ if (r < 0) {
+ CLS_ERR("getting omap vals failed with error %d", r);
+ }
+
+ CLS_ERR("got map of size %d ", (int)out->size());
+ if (out->size() > 1) {
+ out->erase(out->upper_bound(key_data(*keys.rbegin()).encoded().c_str()),
+ out->end());
+ }
+ CLS_ERR("returning map of size %d", (int)out->size());
+ return r;
+}
+
+static int read_many_op(cls_method_context_t hctx, bufferlist *in,
+ bufferlist *out) {
+ CLS_LOG(20, "read_many_op");
+ set<string> op;
+ map<string, bufferlist> outmap;
+ auto it = in->cbegin();
+ try {
+ decode(op, it);
+ } catch (buffer::error & err) {
+ return -EINVAL;
+ }
+ int r = read_many(hctx, op, &outmap);
+ if (r < 0) {
+ return r;
+ } else {
+ encode(outmap, *out);
+ return 0;
+ }
+}
+
+/**
+ * Checks the unwritable xattr. If it is "1" (i.e., it is unwritable), returns
+ * -EACCES. otherwise, returns 0.
+ */
+static int check_writable(cls_method_context_t hctx) {
+ bufferlist bl;
+ int r = cls_cxx_getxattr(hctx, "unwritable", &bl);
+ if (r < 0) {
+ CLS_LOG(20, "error reading xattr %s: %d", "unwritable", r);
+ return r;
+ }
+ if (string(bl.c_str(), bl.length()) == "1") {
+ return -EACCES;
+ } else{
+ return 0;
+ }
+}
+
+static int check_writable_op(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out) {
+ CLS_LOG(20, "check_writable_op");
+ return check_writable(hctx);
+}
+
+/**
+ * returns -EKEYREJECTED if size is outside of bound, according to comparator.
+ *
+ * @bound: the limit to test
+ * @comparator: should be CEPH_OSD_CMPXATTR_OP_[EQ|GT|LT]
+ */
+static int assert_size_in_bound(cls_method_context_t hctx, int bound,
+ int comparator) {
+ //determine size
+ bufferlist size_bl;
+ int r = cls_cxx_getxattr(hctx, "size", &size_bl);
+ if (r < 0) {
+ CLS_LOG(20, "error reading xattr %s: %d", "size", r);
+ return r;
+ }
+
+ int size = atoi(string(size_bl.c_str(), size_bl.length()).c_str());
+ CLS_LOG(20, "size is %d, bound is %d", size, bound);
+
+ //compare size to comparator
+ switch (comparator) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ if (size != bound) {
+ return -EKEYREJECTED;
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ if (size >= bound) {
+ return -EKEYREJECTED;
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ if (size <= bound) {
+ return -EKEYREJECTED;
+ }
+ break;
+ default:
+ CLS_LOG(20, "invalid argument passed to assert_size_in_bound: %d",
+ comparator);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int assert_size_in_bound_op(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out) {
+ CLS_LOG(20, "assert_size_in_bound_op");
+ assert_size_args op;
+ auto it = in->cbegin();
+ try {
+ decode(op, it);
+ } catch (buffer::error& err) {
+ return -EINVAL;
+ }
+ return assert_size_in_bound(hctx, op.bound, op.comparator);
+}
+
+/**
+ * Attempts to insert omap into this object's omap.
+ *
+ * @return:
+ * if unwritable, returns -EACCES.
+ * if size > bound and key doesn't already exist in the omap, returns -EBALANCE.
+ * if exclusive is true, returns -EEXIST if any keys already exist.
+ *
+ * @post: object has omap entries inserted, and size xattr is updated
+ */
+static int omap_insert(cls_method_context_t hctx,
+ const map<string, bufferlist> &omap, int bound, bool exclusive) {
+
+ uint64_t size;
+ time_t time;
+ int r = cls_cxx_stat(hctx, &size, &time);
+ if (r < 0) {
+ return r;
+ }
+ CLS_LOG(20, "inserting %s", omap.begin()->first.c_str());
+ r = check_writable(hctx);
+ if (r < 0) {
+ CLS_LOG(20, "omap_insert: this object is unwritable: %d", r);
+ return r;
+ }
+
+ int assert_bound = bound;
+
+ //if this is an exclusive insert, make sure the key doesn't already exist.
+ for (map<string, bufferlist>::const_iterator it = omap.begin();
+ it != omap.end(); ++it) {
+ bufferlist bl;
+ r = cls_cxx_map_get_val(hctx, it->first, &bl);
+ if (r == 0 && string(bl.c_str(), bl.length()) != ""){
+ if (exclusive) {
+ CLS_LOG(20, "error: this is an exclusive insert and %s exists.",
+ it->first.c_str());
+ return -EEXIST;
+ }
+ assert_bound++;
+ CLS_LOG(20, "increased assert_bound to %d", assert_bound);
+ } else if (r != -ENODATA && r != -ENOENT) {
+ CLS_LOG(20, "error reading omap val for %s: %d", it->first.c_str(), r);
+ return r;
+ }
+ }
+
+ bufferlist old_size;
+ r = cls_cxx_getxattr(hctx, "size", &old_size);
+ if (r < 0) {
+ CLS_LOG(20, "error reading xattr %s: %d", "size", r);
+ return r;
+ }
+
+ int old_size_int = atoi(string(old_size.c_str(), old_size.length()).c_str());
+
+ CLS_LOG(20, "asserting size is less than %d (bound is %d)", assert_bound, bound);
+ if (old_size_int >= assert_bound) {
+ return -EKEYREJECTED;
+ }
+
+ int new_size_int = old_size_int + omap.size() - (assert_bound - bound);
+ CLS_LOG(20, "old size is %d, new size is %d", old_size_int, new_size_int);
+ bufferlist new_size;
+ stringstream s;
+ s << new_size_int;
+ new_size.append(s.str());
+
+ r = cls_cxx_map_set_vals(hctx, &omap);
+ if (r < 0) {
+ CLS_LOG(20, "error setting omap: %d", r);
+ return r;
+ }
+
+ r = cls_cxx_setxattr(hctx, "size", &new_size);
+ if (r < 0) {
+ CLS_LOG(20, "error setting xattr %s: %d", "size", r);
+ return r;
+ }
+ CLS_LOG(20, "successfully inserted %s", omap.begin()->first.c_str());
+ return 0;
+}
+
+static int omap_insert_op(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out) {
+ CLS_LOG(20, "omap_insert");
+ omap_set_args op;
+ auto it = in->cbegin();
+ try {
+ decode(op, it);
+ } catch (buffer::error& err) {
+ return -EINVAL;
+ }
+ return omap_insert(hctx, op.omap, op.bound, op.exclusive);
+}
+
+static int create_with_omap(cls_method_context_t hctx,
+ const map<string, bufferlist> &omap) {
+ CLS_LOG(20, "creating with omap: %s", omap.begin()->first.c_str());
+ //first make sure the object is writable
+ int r = cls_cxx_create(hctx, true);
+ if (r < 0) {
+ CLS_LOG(20, "omap create: creating failed: %d", r);
+ return r;
+ }
+
+ int new_size_int = omap.size();
+ CLS_LOG(20, "omap insert: new size is %d", new_size_int);
+ bufferlist new_size;
+ stringstream s;
+ s << new_size_int;
+ new_size.append(s.str());
+
+ r = cls_cxx_map_set_vals(hctx, &omap);
+ if (r < 0) {
+ CLS_LOG(20, "omap create: error setting omap: %d", r);
+ return r;
+ }
+
+ r = cls_cxx_setxattr(hctx, "size", &new_size);
+ if (r < 0) {
+ CLS_LOG(20, "omap create: error setting xattr %s: %d", "size", r);
+ return r;
+ }
+
+ bufferlist u;
+ u.append("0");
+ r = cls_cxx_setxattr(hctx, "unwritable", &u);
+ if (r < 0) {
+ CLS_LOG(20, "omap create: error setting xattr %s: %d", "unwritable", r);
+ return r;
+ }
+
+ CLS_LOG(20, "successfully created %s", omap.begin()->first.c_str());
+ return 0;
+}
+
+static int create_with_omap_op(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out) {
+ CLS_LOG(20, "omap_insert");
+ map<string, bufferlist> omap;
+ auto it = in->cbegin();
+ try {
+ decode(omap, it);
+ } catch (buffer::error& err) {
+ return -EINVAL;
+ }
+ return create_with_omap(hctx, omap);
+}
+
+/**
+ * Attempts to remove omap from this object's omap.
+ *
+ * @return:
+ * if unwritable, returns -EACCES.
+ * if size < bound and key doesn't already exist in the omap, returns -EBALANCE.
+ * if any of the keys are not in this object, returns -ENODATA.
+ *
+ * @post: object has omap entries removed, and size xattr is updated
+ */
+static int omap_remove(cls_method_context_t hctx,
+ const std::set<string> &omap, int bound) {
+ int r;
+ uint64_t size;
+ time_t time;
+ r = cls_cxx_stat(hctx, &size, &time);
+ if (r < 0) {
+ return r;
+ }
+
+ //first make sure the object is writable
+ r = check_writable(hctx);
+ if (r < 0) {
+ return r;
+ }
+
+ //check for existance of the key first
+ for (set<string>::const_iterator it = omap.begin();
+ it != omap.end(); ++it) {
+ bufferlist bl;
+ r = cls_cxx_map_get_val(hctx, *it, &bl);
+ if (r == -ENOENT || r == -ENODATA
+ || string(bl.c_str(), bl.length()) == ""){
+ return -ENODATA;
+ } else if (r < 0) {
+ CLS_LOG(20, "error reading omap val for %s: %d", it->c_str(), r);
+ return r;
+ }
+ }
+
+ //fail if removing from an object with only bound entries.
+ bufferlist old_size;
+ r = cls_cxx_getxattr(hctx, "size", &old_size);
+ if (r < 0) {
+ CLS_LOG(20, "error reading xattr %s: %d", "size", r);
+ return r;
+ }
+ int old_size_int = atoi(string(old_size.c_str(), old_size.length()).c_str());
+
+ CLS_LOG(20, "asserting size is greater than %d", bound);
+ if (old_size_int <= bound) {
+ return -EKEYREJECTED;
+ }
+
+ int new_size_int = old_size_int - omap.size();
+ CLS_LOG(20, "old size is %d, new size is %d", old_size_int, new_size_int);
+ bufferlist new_size;
+ stringstream s;
+ s << new_size_int;
+ new_size.append(s.str());
+
+ r = cls_cxx_setxattr(hctx, "size", &new_size);
+ if (r < 0) {
+ CLS_LOG(20, "error setting xattr %s: %d", "unwritable", r);
+ return r;
+ }
+
+ for (std::set<string>::const_iterator it = omap.begin();
+ it != omap.end(); ++it) {
+ r = cls_cxx_map_remove_key(hctx, *it);
+ if (r < 0) {
+ CLS_LOG(20, "error removing omap: %d", r);
+ return r;
+ }
+ }
+ return 0;
+}
+
+static int omap_remove_op(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out) {
+ CLS_LOG(20, "omap_remove");
+ omap_rm_args op;
+ auto it = in->cbegin();
+ try {
+ decode(op, it);
+ } catch (buffer::error& err) {
+ return -EINVAL;
+ }
+ return omap_remove(hctx, op.omap, op.bound);
+}
+
+/**
+ * checks to see if this object needs to be split or rebalanced. if so, reads
+ * information about it.
+ *
+ * @post: if assert_size_in_bound(hctx, bound, comparator) succeeds,
+ * odata contains the size, omap, and unwritable attributes for this object.
+ * Otherwise, odata contains the size and unwritable attribute.
+ */
+static int maybe_read_for_balance(cls_method_context_t hctx,
+ object_data &odata, int bound, int comparator) {
+ CLS_LOG(20, "rebalance reading");
+ //if unwritable, return
+ int r = check_writable(hctx);
+ if (r < 0) {
+ odata.unwritable = true;
+ CLS_LOG(20, "rebalance read: error getting xattr %s: %d", "unwritable", r);
+ return r;
+ } else {
+ odata.unwritable = false;
+ }
+
+ //get the size attribute
+ bufferlist size;
+ r = cls_cxx_getxattr(hctx, "size", &size);
+ if (r < 0) {
+ CLS_LOG(20, "rebalance read: error getting xattr %s: %d", "size", r);
+ return r;
+ }
+ odata.size = atoi(string(size.c_str(), size.length()).c_str());
+
+ //check if it needs to be balanced
+ r = assert_size_in_bound(hctx, bound, comparator);
+ if (r < 0) {
+ CLS_LOG(20, "rebalance read: error on asserting size: %d", r);
+ return -EBALANCE;
+ }
+
+ //if the assert succeeded, it needs to be balanced
+ bool more;
+ r = cls_cxx_map_get_vals(hctx, "", "", LONG_MAX, &odata.omap, &more);
+ if (r < 0){
+ CLS_LOG(20, "rebalance read: getting kvs failed with error %d", r);
+ return r;
+ }
+
+ CLS_LOG(20, "rebalance read: size xattr is %llu, omap size is %llu",
+ (unsigned long long)odata.size,
+ (unsigned long long)odata.omap.size());
+ return 0;
+}
+
+static int maybe_read_for_balance_op(cls_method_context_t hctx,
+ bufferlist *in, bufferlist *out) {
+ CLS_LOG(20, "maybe_read_for_balance");
+ rebalance_args op;
+ auto it = in->cbegin();
+ try {
+ decode(op, it);
+ } catch (buffer::error& err) {
+ return -EINVAL;
+ }
+ int r = maybe_read_for_balance(hctx, op.odata, op.bound, op.comparator);
+ if (r < 0) {
+ return r;
+ } else {
+ op.encode(*out);
+ return 0;
+ }
+}
+
+
+CLS_INIT(kvs)
+{
+ CLS_LOG(20, "Loaded assert condition class!");
+
+ cls_handle_t h_class;
+ cls_method_handle_t h_get_idata_from_key;
+ cls_method_handle_t h_get_next_idata;
+ cls_method_handle_t h_get_prev_idata;
+ cls_method_handle_t h_read_many;
+ cls_method_handle_t h_check_writable;
+ cls_method_handle_t h_assert_size_in_bound;
+ cls_method_handle_t h_omap_insert;
+ cls_method_handle_t h_create_with_omap;
+ cls_method_handle_t h_omap_remove;
+ cls_method_handle_t h_maybe_read_for_balance;
+
+ cls_register("kvs", &h_class);
+ cls_register_cxx_method(h_class, "get_idata_from_key",
+ CLS_METHOD_RD,
+ get_idata_from_key_op, &h_get_idata_from_key);
+ cls_register_cxx_method(h_class, "get_next_idata",
+ CLS_METHOD_RD,
+ get_next_idata_op, &h_get_next_idata);
+ cls_register_cxx_method(h_class, "get_prev_idata",
+ CLS_METHOD_RD,
+ get_prev_idata_op, &h_get_prev_idata);
+ cls_register_cxx_method(h_class, "read_many",
+ CLS_METHOD_RD,
+ read_many_op, &h_read_many);
+ cls_register_cxx_method(h_class, "check_writable",
+ CLS_METHOD_RD | CLS_METHOD_WR,
+ check_writable_op, &h_check_writable);
+ cls_register_cxx_method(h_class, "assert_size_in_bound",
+ CLS_METHOD_WR,
+ assert_size_in_bound_op, &h_assert_size_in_bound);
+ cls_register_cxx_method(h_class, "omap_insert",
+ CLS_METHOD_WR,
+ omap_insert_op, &h_omap_insert);
+ cls_register_cxx_method(h_class, "create_with_omap",
+ CLS_METHOD_WR,
+ create_with_omap_op, &h_create_with_omap);
+ cls_register_cxx_method(h_class, "omap_remove",
+ CLS_METHOD_WR,
+ omap_remove_op, &h_omap_remove);
+ cls_register_cxx_method(h_class, "maybe_read_for_balance",
+ CLS_METHOD_RD,
+ maybe_read_for_balance_op, &h_maybe_read_for_balance);
+
+ return;
+}
diff --git a/src/key_value_store/key_value_structure.h b/src/key_value_store/key_value_structure.h
new file mode 100644
index 00000000..c73adc1a
--- /dev/null
+++ b/src/key_value_store/key_value_structure.h
@@ -0,0 +1,149 @@
+/*
+ * Interface for key-value store using librados
+ *
+ * September 2, 2012
+ * Eleanor Cawthon
+ * eleanor.cawthon@inktank.com
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef KEY_VALUE_STRUCTURE_HPP_
+#define KEY_VALUE_STRUCTURE_HPP_
+
+#include "include/rados/librados.hpp"
+#include "include/utime.h"
+#include <vector>
+
+using std::string;
+using std::map;
+using std::set;
+using ceph::bufferlist;
+
+class KeyValueStructure;
+
+/**An injection_t is a function that is called before every
+ * ObjectWriteOperation to test concurrency issues. For example,
+ * one injection_t might cause the client to have a greater chance of dying
+ * mid-split/merge.
+ */
+typedef int (KeyValueStructure::*injection_t)();
+
+/**
+ * Passed to aio methods to be called when the operation completes
+ */
+typedef void (*callback)(int * err, void *arg);
+
+class KeyValueStructure{
+public:
+ map<char, int> opmap;
+
+ //these are injection methods. By default, nothing is called at each
+ //interruption point.
+ /**
+ * returns 0
+ */
+ virtual int nothing() = 0;
+ /**
+ * 10% chance of waiting wait_ms seconds
+ */
+ virtual int wait() = 0;
+ /**
+ * 10% chance of killing the client.
+ */
+ virtual int suicide() = 0;
+
+ ////////////////DESTRUCTOR/////////////////
+ virtual ~KeyValueStructure() {}
+
+ ////////////////UPDATERS///////////////////
+
+ /**
+ * set up the KeyValueStructure (i.e., initialize rados/io_ctx, etc.)
+ */
+ virtual int setup(int argc, const char** argv) = 0;
+
+ /**
+ * set the method that gets called before each ObjectWriteOperation.
+ * If waite_time is set and the method passed involves waiting, it will wait
+ * for that many milliseconds.
+ */
+ virtual void set_inject(injection_t inject, int wait_time) = 0;
+
+ /**
+ * if update_on_existing is false, returns an error if
+ * key already exists in the structure
+ */
+ virtual int set(const string &key, const bufferlist &val,
+ bool update_on_existing) = 0;
+
+ /**
+ * efficiently insert the contents of in_map into the structure
+ */
+ virtual int set_many(const map<string, bufferlist> &in_map) = 0;
+
+ /**
+ * removes the key-value for key. returns an error if key does not exist
+ */
+ virtual int remove(const string &key) = 0;
+
+ /**
+ * removes all keys and values
+ */
+ virtual int remove_all() = 0;
+
+
+ /**
+ * launches a thread to get the value of key. When complete, calls cb(cb_args)
+ */
+ virtual void aio_get(const string &key, bufferlist *val, callback cb,
+ void *cb_args, int * err) = 0;
+
+ /**
+ * launches a thread to set key to val. When complete, calls cb(cb_args)
+ */
+ virtual void aio_set(const string &key, const bufferlist &val, bool exclusive,
+ callback cb, void * cb_args, int * err) = 0;
+
+ /**
+ * launches a thread to remove key. When complete, calls cb(cb_args)
+ */
+ virtual void aio_remove(const string &key, callback cb, void *cb_args,
+ int * err) = 0;
+
+ ////////////////READERS////////////////////
+ /**
+ * gets the val associated with key.
+ *
+ * @param key the key to get
+ * @param val the value is stored in this
+ * @return error code
+ */
+ virtual int get(const string &key, bufferlist *val) = 0;
+
+ /**
+ * stores all keys in keys. set should put them in order by key.
+ */
+ virtual int get_all_keys(std::set<string> *keys) = 0;
+
+ /**
+ * stores all keys and values in kv_map. map should put them in order by key.
+ */
+ virtual int get_all_keys_and_values(map<string,bufferlist> *kv_map) = 0;
+
+ /**
+ * True if the structure meets its own requirements for consistency.
+ */
+ virtual bool is_consistent() = 0;
+
+ /**
+ * prints a string representation of the structure
+ */
+ virtual string str() = 0;
+};
+
+
+#endif /* KEY_VALUE_STRUCTURE_HPP_ */
diff --git a/src/key_value_store/kv_flat_btree_async.cc b/src/key_value_store/kv_flat_btree_async.cc
new file mode 100644
index 00000000..5aec8032
--- /dev/null
+++ b/src/key_value_store/kv_flat_btree_async.cc
@@ -0,0 +1,2346 @@
+/*
+ * Key-value store using librados
+ *
+ * September 2, 2012
+ * Eleanor Cawthon
+ * eleanor.cawthon@inktank.com
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "include/compat.h"
+#include "key_value_store/key_value_structure.h"
+#include "key_value_store/kv_flat_btree_async.h"
+#include "key_value_store/kvs_arg_types.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_context.h"
+#include "common/Clock.h"
+#include "include/types.h"
+
+#include <errno.h>
+#include <string>
+#include <iostream>
+#include <cassert>
+#include <climits>
+#include <cmath>
+#include <sstream>
+#include <stdlib.h>
+#include <iterator>
+
+using ceph::bufferlist;
+
+bool index_data::is_timed_out(utime_t now, utime_t timeout) const {
+ return prefix != "" && now - ts > timeout;
+}
+
+void IndexCache::clear() {
+ k2itmap.clear();
+ t2kmap.clear();
+}
+
+void IndexCache::push(const string &key, const index_data &idata) {
+ if (cache_size == 0) {
+ return;
+ }
+ index_data old_idata;
+ map<key_data, pair<index_data, utime_t> >::iterator old_it =
+ k2itmap.lower_bound(key_data(key));
+ if (old_it != k2itmap.end()) {
+ t2kmap.erase(old_it->second.second);
+ k2itmap.erase(old_it);
+ }
+ map<key_data, pair<index_data, utime_t> >::iterator new_it =
+ k2itmap.find(idata.kdata);
+ if (new_it != k2itmap.end()) {
+ utime_t old_time = new_it->second.second;
+ t2kmap.erase(old_time);
+ }
+ utime_t time = ceph_clock_now();
+ k2itmap[idata.kdata] = make_pair(idata, time);
+ t2kmap[time] = idata.kdata;
+ if ((int)k2itmap.size() > cache_size) {
+ pop();
+ }
+
+}
+
+void IndexCache::push(const index_data &idata) {
+ if (cache_size == 0) {
+ return;
+ }
+ if (k2itmap.count(idata.kdata) > 0) {
+ utime_t old_time = k2itmap[idata.kdata].second;
+ t2kmap.erase(old_time);
+ k2itmap.erase(idata.kdata);
+ }
+ utime_t time = ceph_clock_now();
+ k2itmap[idata.kdata] = make_pair(idata, time);
+ t2kmap[time] = idata.kdata;
+ if ((int)k2itmap.size() > cache_size) {
+ pop();
+ }
+}
+
+void IndexCache::pop() {
+ if (cache_size == 0) {
+ return;
+ }
+ map<utime_t, key_data>::iterator it = t2kmap.begin();
+ utime_t time = it->first;
+ key_data kdata = it->second;
+ k2itmap.erase(kdata);
+ t2kmap.erase(time);
+}
+
+void IndexCache::erase(key_data kdata) {
+ if (cache_size == 0) {
+ return;
+ }
+ if (k2itmap.count(kdata) > 0) {
+ utime_t c = k2itmap[kdata].second;
+ k2itmap.erase(kdata);
+ t2kmap.erase(c);
+ }
+}
+
+int IndexCache::get(const string &key, index_data *idata) const {
+ if (cache_size == 0) {
+ return -ENODATA;
+ }
+ if ((int)k2itmap.size() == 0) {
+ return -ENODATA;
+ }
+ map<key_data, pair<index_data, utime_t> >::const_iterator it =
+ k2itmap.lower_bound(key_data(key));
+ if (it == k2itmap.end() || !(it->second.first.min_kdata < key_data(key))) {
+ return -ENODATA;
+ } else {
+ *idata = it->second.first;
+ }
+ return 0;
+}
+
+int IndexCache::get(const string &key, index_data *idata,
+ index_data *next_idata) const {
+ if (cache_size == 0) {
+ return -ENODATA;
+ }
+ map<key_data, pair<index_data, utime_t> >::const_iterator it =
+ k2itmap.lower_bound(key_data(key));
+ if (it == k2itmap.end() || ++it == k2itmap.end()) {
+ return -ENODATA;
+ } else {
+ --it;
+ if (!(it->second.first.min_kdata < key_data(key))){
+ //stale, should be reread.
+ return -ENODATA;
+ } else {
+ *idata = it->second.first;
+ ++it;
+ if (it != k2itmap.end()) {
+ *next_idata = it->second.first;
+ }
+ }
+ }
+ return 0;
+}
+
+int KvFlatBtreeAsync::nothing() {
+ return 0;
+}
+
+int KvFlatBtreeAsync::wait() {
+ if (rand() % 10 == 0) {
+ usleep(wait_ms);
+ }
+ return 0;
+}
+
+int KvFlatBtreeAsync::suicide() {
+ if (rand() % 10 == 0) {
+ if (verbose) cout << client_name << " is suiciding" << std::endl;
+ return 1;
+ }
+ return 0;
+}
+
+int KvFlatBtreeAsync::next(const index_data &idata, index_data * out_data)
+{
+ if (verbose) cout << "\t\t" << client_name << "-next: finding next of "
+ << idata.str()
+ << std::endl;
+ int err = 0;
+ librados::ObjectReadOperation oro;
+ std::map<std::string, bufferlist> kvs;
+ oro.omap_get_vals2(idata.kdata.encoded(),1,&kvs, nullptr, &err);
+ err = io_ctx.operate(index_name, &oro, NULL);
+ if (err < 0){
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-next: getting index failed with error "
+ << err << std::endl;
+ return err;
+ }
+ if (!kvs.empty()) {
+ out_data->kdata.parse(kvs.begin()->first);
+ auto b = kvs.begin()->second.cbegin();
+ out_data->decode(b);
+ if (idata.is_timed_out(ceph_clock_now(), timeout)) {
+ if (verbose) cout << client_name << " THINKS THE OTHER CLIENT DIED."
+ << std::endl;
+ //the client died after deleting the object. clean up.
+ cleanup(idata, err);
+ }
+ } else {
+ err = -EOVERFLOW;
+ }
+ return err;
+}
+
+int KvFlatBtreeAsync::prev(const index_data &idata, index_data * out_data)
+{
+ if (verbose) cout << "\t\t" << client_name << "-prev: finding prev of "
+ << idata.str() << std::endl;
+ int err = 0;
+ bufferlist inbl;
+ idata_from_idata_args in_args;
+ in_args.idata = idata;
+ in_args.encode(inbl);
+ bufferlist outbl;
+ err = io_ctx.exec(index_name,"kvs", "get_prev_idata", inbl, outbl);
+ if (err < 0){
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-prev: getting index failed with error "
+ << err << std::endl;
+ if (idata.is_timed_out(ceph_clock_now(), timeout)) {
+ if (verbose) cout << client_name << " THINKS THE OTHER CLIENT DIED."
+ << std::endl;
+ //the client died after deleting the object. clean up.
+ err = cleanup(idata, err);
+ if (err == -ESUICIDE) {
+ return err;
+ } else {
+ err = 0;
+ }
+ }
+ return err;
+ }
+ auto it = outbl.cbegin();
+ in_args.decode(it);
+ *out_data = in_args.next_idata;
+ if (verbose) cout << "\t\t" << client_name << "-prev: prev is "
+ << out_data->str()
+ << std::endl;
+ return err;
+}
+
+int KvFlatBtreeAsync::read_index(const string &key, index_data * idata,
+ index_data * next_idata, bool force_update) {
+ int err = 0;
+ if (!force_update) {
+ if (verbose) cout << "\t" << client_name
+ << "-read_index: getting index_data for " << key
+ << " from cache" << std::endl;
+ icache_lock.Lock();
+ if (next_idata != NULL) {
+ err = icache.get(key, idata, next_idata);
+ } else {
+ err = icache.get(key, idata);
+ }
+ icache_lock.Unlock();
+
+ if (err == 0) {
+ //if (verbose) cout << "CACHE SUCCESS" << std::endl;
+ return err;
+ } else {
+ if (verbose) cout << "NOT IN CACHE" << std::endl;
+ }
+ }
+
+ if (verbose) cout << "\t" << client_name
+ << "-read_index: getting index_data for " << key
+ << " from object" << std::endl;
+ librados::ObjectReadOperation oro;
+ bufferlist raw_val;
+ std::set<std::string> key_set;
+ key_set.insert(key_data(key).encoded());
+ std::map<std::string, bufferlist> kvmap;
+ std::map<std::string, bufferlist> dupmap;
+ oro.omap_get_vals_by_keys(key_set, &dupmap, &err);
+ oro.omap_get_vals2(key_data(key).encoded(),
+ (cache_size / cache_refresh >= 2? cache_size / cache_refresh: 2),
+ &kvmap, nullptr, &err);
+ err = io_ctx.operate(index_name, &oro, NULL);
+ utime_t mytime = ceph_clock_now();
+ if (err < 0){
+ cerr << "\t" << client_name
+ << "-read_index: getting keys failed with "
+ << err << std::endl;
+ ceph_abort_msg(client_name + "-read_index: reading index failed");
+ return err;
+ }
+ kvmap.insert(dupmap.begin(), dupmap.end());
+ for (map<string, bufferlist>::iterator it = ++kvmap.begin();
+ it != kvmap.end();
+ ++it) {
+ bufferlist bl = it->second;
+ auto blit = bl.cbegin();
+ index_data this_idata;
+ this_idata.decode(blit);
+ if (this_idata.is_timed_out(mytime, timeout)) {
+ if (verbose) cout << client_name
+ << " THINKS THE OTHER CLIENT DIED. (mytime is "
+ << mytime.sec() << "." << mytime.usec() << ", idata.ts is "
+ << this_idata.ts.sec() << "." << this_idata.ts.usec()
+ << ", it has been " << (mytime - this_idata.ts).sec()
+ << '.' << (mytime - this_idata.ts).usec()
+ << ", timeout is " << timeout << ")" << std::endl;
+ //the client died after deleting the object. clean up.
+ if (cleanup(this_idata, -EPREFIX) == -ESUICIDE) {
+ return -ESUICIDE;
+ }
+ return read_index(key, idata, next_idata, force_update);
+ }
+ icache_lock.Lock();
+ icache.push(this_idata);
+ icache_lock.Unlock();
+ }
+ auto b = kvmap.begin()->second.cbegin();
+ idata->decode(b);
+ idata->kdata.parse(kvmap.begin()->first);
+ if (verbose) cout << "\t" << client_name << "-read_index: kvmap_size is "
+ << kvmap.size()
+ << ", idata is " << idata->str() << std::endl;
+
+ ceph_assert(idata->obj != "");
+ icache_lock.Lock();
+ icache.push(key, *idata);
+ icache_lock.Unlock();
+
+ if (next_idata != NULL && idata->kdata.prefix != "1") {
+ next_idata->kdata.parse((++kvmap.begin())->first);
+ auto nb = (++kvmap.begin())->second.cbegin();
+ next_idata->decode(nb);
+ icache_lock.Lock();
+ icache.push(*next_idata);
+ icache_lock.Unlock();
+ }
+ return err;
+}
+
+int KvFlatBtreeAsync::split(const index_data &idata) {
+ int err = 0;
+ opmap['l']++;
+
+ if (idata.prefix != "") {
+ return -EPREFIX;
+ }
+
+ rebalance_args args;
+ args.bound = 2 * k - 1;
+ args.comparator = CEPH_OSD_CMPXATTR_OP_GT;
+ err = read_object(idata.obj, &args);
+ args.odata.max_kdata = idata.kdata;
+ if (err < 0) {
+ if (verbose) cout << "\t\t" << client_name << "-split: read object "
+ << args.odata.name
+ << " got " << err << std::endl;
+ return err;
+ }
+
+ if (verbose) cout << "\t\t" << client_name << "-split: splitting "
+ << idata.obj
+ << ", which has size " << args.odata.size
+ << " and actual size " << args.odata.omap.size() << std::endl;
+
+ ///////preparations that happen outside the critical section
+ //for prefix index
+ vector<object_data> to_create;
+ vector<object_data> to_delete;
+ to_delete.push_back(object_data(idata.min_kdata,
+ args.odata.max_kdata, args.odata.name, args.odata.version));
+
+ //for lower half object
+ map<std::string, bufferlist>::const_iterator it = args.odata.omap.begin();
+ client_index_lock.Lock();
+ to_create.push_back(object_data(to_string(client_name, client_index++)));
+ client_index_lock.Unlock();
+ for (int i = 0; i < k; i++) {
+ to_create[0].omap.insert(*it);
+ ++it;
+ }
+ to_create[0].min_kdata = idata.min_kdata;
+ to_create[0].max_kdata = key_data(to_create[0].omap.rbegin()->first);
+
+ //for upper half object
+ client_index_lock.Lock();
+ to_create.push_back(object_data(to_create[0].max_kdata,
+ args.odata.max_kdata,
+ to_string(client_name, client_index++)));
+ client_index_lock.Unlock();
+ to_create[1].omap.insert(
+ ++args.odata.omap.find(to_create[0].omap.rbegin()->first),
+ args.odata.omap.end());
+
+ //setting up operations
+ librados::ObjectWriteOperation owos[6];
+ vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > ops;
+ index_data out_data;
+ set_up_prefix_index(to_create, to_delete, &owos[0], &out_data, &err);
+ ops.push_back(make_pair(
+ pair<int, string>(ADD_PREFIX, index_name),
+ &owos[0]));
+ for (int i = 1; i < 6; i++) {
+ ops.push_back(make_pair(make_pair(0,""), &owos[i]));
+ }
+ set_up_ops(to_create, to_delete, &ops, out_data, &err);
+
+ /////BEGIN CRITICAL SECTION/////
+ //put prefix on index entry for idata.val
+ err = perform_ops("\t\t" + client_name + "-split:", out_data, &ops);
+ if (err < 0) {
+ return err;
+ }
+ if (verbose) cout << "\t\t" << client_name << "-split: done splitting."
+ << std::endl;
+ /////END CRITICAL SECTION/////
+ icache_lock.Lock();
+ for (vector<delete_data>::iterator it = out_data.to_delete.begin();
+ it != out_data.to_delete.end(); ++it) {
+ icache.erase(it->max);
+ }
+ for (vector<create_data>::iterator it = out_data.to_create.begin();
+ it != out_data.to_create.end(); ++it) {
+ icache.push(index_data(*it));
+ }
+ icache_lock.Unlock();
+ return err;
+}
+
+int KvFlatBtreeAsync::rebalance(const index_data &idata1,
+ const index_data &next_idata){
+ opmap['m']++;
+ int err = 0;
+
+ if (idata1.prefix != "") {
+ return -EPREFIX;
+ }
+
+ rebalance_args args1;
+ args1.bound = k + 1;
+ args1.comparator = CEPH_OSD_CMPXATTR_OP_LT;
+ index_data idata2 = next_idata;
+
+ rebalance_args args2;
+ args2.bound = k + 1;
+ args2.comparator = CEPH_OSD_CMPXATTR_OP_LT;
+
+ if (idata1.kdata.prefix == "1") {
+ //this is the highest key in the index, so it doesn't have a next.
+
+ //read the index for the previous entry
+ err = prev(idata1, &idata2);
+ if (err == -ERANGE) {
+ if (verbose) cout << "\t\t" << client_name
+ << "-rebalance: this is the only node, "
+ << "so aborting" << std::endl;
+ return -EUCLEAN;
+ } else if (err < 0) {
+ return err;
+ }
+
+ //read the first object
+ err = read_object(idata1.obj, &args2);
+ if (err < 0) {
+ if (verbose) cout << "reading " << idata1.obj << " failed with " << err
+ << std::endl;
+ if (err == -ENOENT) {
+ return -ECANCELED;
+ }
+ return err;
+ }
+ args2.odata.min_kdata = idata1.min_kdata;
+ args2.odata.max_kdata = idata1.kdata;
+
+ //read the second object
+ args1.bound = 2 * k + 1;
+ err = read_object(idata2.obj, &args1);
+ if (err < 0) {
+ if (verbose) cout << "reading " << idata1.obj << " failed with " << err
+ << std::endl;
+ return err;
+ }
+ args1.odata.min_kdata = idata2.min_kdata;
+ args1.odata.max_kdata = idata2.kdata;
+
+ if (verbose) cout << "\t\t" << client_name << "-rebalance: read "
+ << idata2.obj
+ << ". size: " << args1.odata.size << " version: "
+ << args1.odata.version
+ << std::endl;
+ } else {
+ assert (next_idata.obj != "");
+ //there is a next key, so get it.
+ err = read_object(idata1.obj, &args1);
+ if (err < 0) {
+ if (verbose) cout << "reading " << idata1.obj << " failed with " << err
+ << std::endl;
+ return err;
+ }
+ args1.odata.min_kdata = idata1.min_kdata;
+ args1.odata.max_kdata = idata1.kdata;
+
+ args2.bound = 2 * k + 1;
+ err = read_object(idata2.obj, &args2);
+ if (err < 0) {
+ if (verbose) cout << "reading " << idata1.obj << " failed with " << err
+ << std::endl;
+ if (err == -ENOENT) {
+ return -ECANCELED;
+ }
+ return err;
+ }
+ args2.odata.min_kdata = idata2.min_kdata;
+ args2.odata.max_kdata = idata2.kdata;
+
+ if (verbose) cout << "\t\t" << client_name << "-rebalance: read "
+ << idata2.obj
+ << ". size: " << args2.odata.size << " version: "
+ << args2.odata.version
+ << std::endl;
+ }
+
+ if (verbose) cout << "\t\t" << client_name << "-rebalance: o1 is "
+ << args1.odata.max_kdata.encoded() << ","
+ << args1.odata.name << " with size " << args1.odata.size
+ << " , o2 is " << args2.odata.max_kdata.encoded()
+ << "," << args2.odata.name << " with size " << args2.odata.size
+ << std::endl;
+
+ //calculations
+ if ((int)args1.odata.size > k && (int)args1.odata.size <= 2*k
+ && (int)args2.odata.size > k
+ && (int)args2.odata.size <= 2*k) {
+ //nothing to do
+ if (verbose) cout << "\t\t" << client_name
+ << "-rebalance: both sizes in range, so"
+ << " aborting " << std::endl;
+ return -EBALANCE;
+ } else if (idata1.prefix != "" || idata2.prefix != "") {
+ return -EPREFIX;
+ }
+
+ //this is the high object. it gets created regardless of rebalance or merge.
+ client_index_lock.Lock();
+ string o2w = to_string(client_name, client_index++);
+ client_index_lock.Unlock();
+ index_data idata;
+ vector<object_data> to_create;
+ vector<object_data> to_delete;
+ librados::ObjectWriteOperation create[2];//possibly only 1 will be used
+ librados::ObjectWriteOperation other_ops[6];
+ vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > ops;
+ ops.push_back(make_pair(
+ pair<int, string>(ADD_PREFIX, index_name),
+ &other_ops[0]));
+
+ if ((int)args1.odata.size + (int)args2.odata.size <= 2*k) {
+ //merge
+ if (verbose) cout << "\t\t" << client_name << "-rebalance: merging "
+ << args1.odata.name
+ << " and " << args2.odata.name << " to get " << o2w
+ << std::endl;
+ map<string, bufferlist> write2_map;
+ write2_map.insert(args1.odata.omap.begin(), args1.odata.omap.end());
+ write2_map.insert(args2.odata.omap.begin(), args2.odata.omap.end());
+ to_create.push_back(object_data(args1.odata.min_kdata,
+ args2.odata.max_kdata, o2w, write2_map));
+ ops.push_back(make_pair(
+ pair<int, string>(MAKE_OBJECT, o2w),
+ &create[0]));
+ ceph_assert((int)write2_map.size() <= 2*k);
+ } else {
+ //rebalance
+ if (verbose) cout << "\t\t" << client_name << "-rebalance: rebalancing "
+ << args1.odata.name
+ << " and " << args2.odata.name << std::endl;
+ map<std::string, bufferlist> write1_map;
+ map<std::string, bufferlist> write2_map;
+ map<std::string, bufferlist>::iterator it;
+ client_index_lock.Lock();
+ string o1w = to_string(client_name, client_index++);
+ client_index_lock.Unlock();
+ int target_size_1 = ceil(((int)args1.odata.size + (int)args2.odata.size)
+ / 2.0);
+ if (args1.odata.max_kdata != idata1.kdata) {
+ //this should be true if idata1 is the high object
+ target_size_1 = floor(((int)args1.odata.size + (int)args2.odata.size)
+ / 2.0);
+ }
+ for (it = args1.odata.omap.begin();
+ it != args1.odata.omap.end() && (int)write1_map.size()
+ < target_size_1;
+ ++it) {
+ write1_map.insert(*it);
+ }
+ if (it != args1.odata.omap.end()){
+ //write1_map is full, so put the rest in write2_map
+ write2_map.insert(it, args1.odata.omap.end());
+ write2_map.insert(args2.odata.omap.begin(), args2.odata.omap.end());
+ } else {
+ //args1.odata.omap was small, and write2_map still needs more
+ map<std::string, bufferlist>::iterator it2;
+ for(it2 = args2.odata.omap.begin();
+ (it2 != args2.odata.omap.end()) && ((int)write1_map.size()
+ < target_size_1);
+ ++it2) {
+ write1_map.insert(*it2);
+ }
+ write2_map.insert(it2, args2.odata.omap.end());
+ }
+ if (verbose) cout << "\t\t" << client_name
+ << "-rebalance: write1_map has size "
+ << write1_map.size() << ", write2_map.size() is " << write2_map.size()
+ << std::endl;
+ //at this point, write1_map and write2_map should have the correct pairs
+ to_create.push_back(object_data(args1.odata.min_kdata,
+ key_data(write1_map.rbegin()->first),
+ o1w,write1_map));
+ to_create.push_back(object_data( key_data(write1_map.rbegin()->first),
+ args2.odata.max_kdata, o2w, write2_map));
+ ops.push_back(make_pair(
+ pair<int, string>(MAKE_OBJECT, o1w),
+ &create[0]));
+ ops.push_back(make_pair(
+ pair<int, string>(MAKE_OBJECT, o2w),
+ &create[1]));
+ }
+
+ to_delete.push_back(object_data(args1.odata.min_kdata,
+ args1.odata.max_kdata, args1.odata.name, args1.odata.version));
+ to_delete.push_back(object_data(args2.odata.min_kdata,
+ args2.odata.max_kdata, args2.odata.name, args2.odata.version));
+ for (int i = 1; i < 6; i++) {
+ ops.push_back(make_pair(make_pair(0,""), &other_ops[i]));
+ }
+
+ index_data out_data;
+ set_up_prefix_index(to_create, to_delete, &other_ops[0], &out_data, &err);
+ set_up_ops(to_create, to_delete, &ops, out_data, &err);
+
+ //at this point, all operations should be completely set up.
+ /////BEGIN CRITICAL SECTION/////
+ err = perform_ops("\t\t" + client_name + "-rebalance:", out_data, &ops);
+ if (err < 0) {
+ return err;
+ }
+ icache_lock.Lock();
+ for (vector<delete_data>::iterator it = out_data.to_delete.begin();
+ it != out_data.to_delete.end(); ++it) {
+ icache.erase(it->max);
+ }
+ for (vector<create_data>::iterator it = out_data.to_create.begin();
+ it != out_data.to_create.end(); ++it) {
+ icache.push(index_data(*it));
+ }
+ icache_lock.Unlock();
+ if (verbose) cout << "\t\t" << client_name << "-rebalance: done rebalancing."
+ << std::endl;
+ /////END CRITICAL SECTION/////
+ return err;
+}
+
+int KvFlatBtreeAsync::read_object(const string &obj, object_data * odata) {
+ librados::ObjectReadOperation get_obj;
+ librados::AioCompletion * obj_aioc = rados.aio_create_completion();
+ int err;
+ bufferlist unw_bl;
+ odata->name = obj;
+ get_obj.omap_get_vals2("", LONG_MAX, &odata->omap, nullptr, &err);
+ get_obj.getxattr("unwritable", &unw_bl, &err);
+ io_ctx.aio_operate(obj, obj_aioc, &get_obj, NULL);
+ obj_aioc->wait_for_safe();
+ err = obj_aioc->get_return_value();
+ if (err < 0){
+ //possibly -ENOENT, meaning someone else deleted it.
+ obj_aioc->release();
+ return err;
+ }
+ odata->unwritable = string(unw_bl.c_str(), unw_bl.length()) == "1";
+ odata->version = obj_aioc->get_version64();
+ odata->size = odata->omap.size();
+ obj_aioc->release();
+ return 0;
+}
+
+int KvFlatBtreeAsync::read_object(const string &obj, rebalance_args * args) {
+ bufferlist inbl;
+ args->encode(inbl);
+ bufferlist outbl;
+ int err;
+ librados::AioCompletion * a = rados.aio_create_completion();
+ io_ctx.aio_exec(obj, a, "kvs", "maybe_read_for_balance", inbl, &outbl);
+ a->wait_for_safe();
+ err = a->get_return_value();
+ if (err < 0) {
+ if (verbose) cout << "\t\t" << client_name
+ << "-read_object: reading failed with "
+ << err << std::endl;
+ a->release();
+ return err;
+ }
+ auto it = outbl.cbegin();
+ args->decode(it);
+ args->odata.name = obj;
+ args->odata.version = a->get_version64();
+ a->release();
+ return err;
+}
+
+void KvFlatBtreeAsync::set_up_prefix_index(
+ const vector<object_data> &to_create,
+ const vector<object_data> &to_delete,
+ librados::ObjectWriteOperation * owo,
+ index_data * idata,
+ int * err) {
+ std::map<std::string, pair<bufferlist, int> > assertions;
+ map<string, bufferlist> to_insert;
+ idata->prefix = "1";
+ idata->ts = ceph_clock_now();
+ for(vector<object_data>::const_iterator it = to_create.begin();
+ it != to_create.end();
+ ++it) {
+ create_data c(it->min_kdata, it->max_kdata, it->name);
+ idata->to_create.push_back(c);
+ }
+ for(vector<object_data>::const_iterator it = to_delete.begin();
+ it != to_delete.end();
+ ++it) {
+ delete_data d(it->min_kdata, it->max_kdata, it->name, it->version);
+ idata->to_delete.push_back(d);
+ }
+ for(vector<object_data>::const_iterator it = to_delete.begin();
+ it != to_delete.end();
+ ++it) {
+ idata->obj = it->name;
+ idata->min_kdata = it->min_kdata;
+ idata->kdata = it->max_kdata;
+ bufferlist insert;
+ idata->encode(insert);
+ to_insert[it->max_kdata.encoded()] = insert;
+ index_data this_entry;
+ this_entry.min_kdata = idata->min_kdata;
+ this_entry.kdata = idata->kdata;
+ this_entry.obj = idata->obj;
+ assertions[it->max_kdata.encoded()] = pair<bufferlist, int>
+ (to_bl(this_entry), CEPH_OSD_CMPXATTR_OP_EQ);
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-setup_prefix: will assert "
+ << this_entry.str() << std::endl;
+ }
+ ceph_assert(*err == 0);
+ owo->omap_cmp(assertions, err);
+ if (to_create.size() <= 2) {
+ owo->omap_set(to_insert);
+ }
+}
+
+//some args can be null if there are no corresponding entries in p
+void KvFlatBtreeAsync::set_up_ops(
+ const vector<object_data> &create_vector,
+ const vector<object_data> &delete_vector,
+ vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > * ops,
+ const index_data &idata,
+ int * err) {
+ vector<pair<pair<int, string>,
+ librados::ObjectWriteOperation* > >::iterator it;
+
+ //skip the prefixing part
+ for(it = ops->begin(); it->first.first == ADD_PREFIX; ++it) {}
+ map<string, bufferlist> to_insert;
+ std::set<string> to_remove;
+ map<string, pair<bufferlist, int> > assertions;
+ if (create_vector.size() > 0) {
+ for (int i = 0; i < (int)idata.to_delete.size(); ++i) {
+ it->first = pair<int, string>(UNWRITE_OBJECT, idata.to_delete[i].obj);
+ set_up_unwrite_object(delete_vector[i].version, it->second);
+ ++it;
+ }
+ }
+ for (int i = 0; i < (int)idata.to_create.size(); ++i) {
+ index_data this_entry(idata.to_create[i].max, idata.to_create[i].min,
+ idata.to_create[i].obj);
+ to_insert[idata.to_create[i].max.encoded()] = to_bl(this_entry);
+ if (idata.to_create.size() <= 2) {
+ it->first = pair<int, string>(MAKE_OBJECT, idata.to_create[i].obj);
+ } else {
+ it->first = pair<int, string>(AIO_MAKE_OBJECT, idata.to_create[i].obj);
+ }
+ set_up_make_object(create_vector[i].omap, it->second);
+ ++it;
+ }
+ for (int i = 0; i < (int)idata.to_delete.size(); ++i) {
+ index_data this_entry = idata;
+ this_entry.obj = idata.to_delete[i].obj;
+ this_entry.min_kdata = idata.to_delete[i].min;
+ this_entry.kdata = idata.to_delete[i].max;
+ if (verbose) cout << "\t\t\t" << client_name << "-setup_ops: will assert "
+ << this_entry.str() << std::endl;
+ assertions[idata.to_delete[i].max.encoded()] = pair<bufferlist, int>(
+ to_bl(this_entry), CEPH_OSD_CMPXATTR_OP_EQ);
+ to_remove.insert(idata.to_delete[i].max.encoded());
+ it->first = pair<int, string>(REMOVE_OBJECT, idata.to_delete[i].obj);
+ set_up_delete_object(it->second);
+ ++it;
+ }
+ if ((int)idata.to_create.size() <= 2) {
+ it->second->omap_cmp(assertions, err);
+ }
+ it->second->omap_rm_keys(to_remove);
+ it->second->omap_set(to_insert);
+
+
+ it->first = pair<int, string>(REMOVE_PREFIX, index_name);
+}
+
+void KvFlatBtreeAsync::set_up_make_object(
+ const map<std::string, bufferlist> &to_set,
+ librados::ObjectWriteOperation *owo) {
+ bufferlist inbl;
+ encode(to_set, inbl);
+ owo->exec("kvs", "create_with_omap", inbl);
+}
+
+void KvFlatBtreeAsync::set_up_unwrite_object(
+ const int &ver, librados::ObjectWriteOperation *owo) {
+ if (ver > 0) {
+ owo->assert_version(ver);
+ }
+ owo->cmpxattr("unwritable", CEPH_OSD_CMPXATTR_OP_EQ, to_bl("0"));
+ owo->setxattr("unwritable", to_bl("1"));
+}
+
+void KvFlatBtreeAsync::set_up_restore_object(
+ librados::ObjectWriteOperation *owo) {
+ owo->cmpxattr("unwritable", CEPH_OSD_CMPXATTR_OP_EQ, to_bl("1"));
+ owo->setxattr("unwritable", to_bl("0"));
+}
+
+void KvFlatBtreeAsync::set_up_delete_object(
+ librados::ObjectWriteOperation *owo) {
+ owo->cmpxattr("unwritable", CEPH_OSD_CMPXATTR_OP_EQ, to_bl("1"));
+ owo->remove();
+}
+
+int KvFlatBtreeAsync::perform_ops(const string &debug_prefix,
+ const index_data &idata,
+ vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > *ops) {
+ int err = 0;
+ vector<librados::AioCompletion*> aiocs(idata.to_create.size());
+ int count = 0;
+ for (vector<pair<pair<int, string>,
+ librados::ObjectWriteOperation*> >::iterator it = ops->begin();
+ it != ops->end(); ++it) {
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 ) {
+ return -ESUICIDE;
+ }
+ switch (it->first.first) {
+ case ADD_PREFIX://prefixing
+ if (verbose) cout << debug_prefix << " adding prefix" << std::endl;
+ err = io_ctx.operate(index_name, it->second);
+ if (err < 0) {
+ if (verbose) cout << debug_prefix << " prefixing the index failed with "
+ << err << std::endl;
+ return -EPREFIX;
+ }
+ if (verbose) cout << debug_prefix << " prefix added." << std::endl;
+ break;
+ case UNWRITE_OBJECT://marking
+ if (verbose) cout << debug_prefix << " marking " << it->first.second
+ << std::endl;
+ err = io_ctx.operate(it->first.second, it->second);
+ if (err < 0) {
+ //most likely because it changed, in which case it will be -ERANGE
+ if (verbose) cout << debug_prefix << " marking " << it->first.second
+ << "failed with code" << err << std::endl;
+ if (it->first.second == (*idata.to_delete.begin()).max.encoded()) {
+ if (cleanup(idata, -EFIRSTOBJ) == -ESUICIDE) {
+ return -ESUICIDE;
+ }
+ } else {
+ if (cleanup(idata, -ERANGE) == -ESUICIDE) {
+ return -ESUICIDE;
+ }
+ }
+ return err;
+ }
+ if (verbose) cout << debug_prefix << " marked " << it->first.second
+ << std::endl;
+ break;
+ case MAKE_OBJECT://creating
+ if (verbose) cout << debug_prefix << " creating " << it->first.second
+ << std::endl;
+ err = io_ctx.operate(it->first.second, it->second);
+ if (err < 0) {
+ //this can happen if someone else was cleaning up after us.
+ if (verbose) cout << debug_prefix << " creating " << it->first.second
+ << " failed"
+ << " with code " << err << std::endl;
+ if (err == -EEXIST) {
+ //someone thinks we died, so die
+ if (verbose) cout << client_name << " is suiciding!" << std::endl;
+ return -ESUICIDE;
+ } else {
+ ceph_abort();
+ }
+ return err;
+ }
+ if (verbose || idata.to_create.size() > 2) {
+ cout << debug_prefix << " created object " << it->first.second
+ << std::endl;
+ }
+ break;
+ case AIO_MAKE_OBJECT:
+ cout << debug_prefix << " launching asynchronous create "
+ << it->first.second << std::endl;
+ aiocs[count] = rados.aio_create_completion();
+ io_ctx.aio_operate(it->first.second, aiocs[count], it->second);
+ count++;
+ if ((int)idata.to_create.size() == count) {
+ cout << "starting aiowrite waiting loop" << std::endl;
+ for (count -= 1; count >= 0; count--) {
+ aiocs[count]->wait_for_safe();
+ err = aiocs[count]->get_return_value();
+ if (err < 0) {
+ //this can happen if someone else was cleaning up after us.
+ cerr << debug_prefix << " a create failed"
+ << " with code " << err << std::endl;
+ if (err == -EEXIST) {
+ //someone thinks we died, so die
+ cerr << client_name << " is suiciding!" << std::endl;
+ return -ESUICIDE;
+ } else {
+ ceph_abort();
+ }
+ return err;
+ }
+ if (verbose || idata.to_create.size() > 2) {
+ cout << debug_prefix << " completed aio " << aiocs.size() - count
+ << "/" << aiocs.size() << std::endl;
+ }
+ }
+ }
+ break;
+ case REMOVE_OBJECT://deleting
+ if (verbose) cout << debug_prefix << " deleting " << it->first.second
+ << std::endl;
+ err = io_ctx.operate(it->first.second, it->second);
+ if (err < 0) {
+ //if someone else called cleanup on this prefix first
+ if (verbose) cout << debug_prefix << " deleting " << it->first.second
+ << "failed with code" << err << std::endl;
+ }
+ if (verbose) cout << debug_prefix << " deleted " << it->first.second
+ << std::endl;
+ break;
+ case REMOVE_PREFIX://rewriting index
+ if (verbose) cout << debug_prefix << " updating index " << std::endl;
+ err = io_ctx.operate(index_name, it->second);
+ if (err < 0) {
+ if (verbose) cout << debug_prefix
+ << " rewriting the index failed with code " << err
+ << ". someone else must have thought we died, so dying" << std::endl;
+ return -ETIMEDOUT;
+ }
+ if (verbose) cout << debug_prefix << " updated index." << std::endl;
+ break;
+ case RESTORE_OBJECT:
+ if (verbose) cout << debug_prefix << " restoring " << it->first.second
+ << std::endl;
+ err = io_ctx.operate(it->first.second, it->second);
+ if (err < 0) {
+ if (verbose) cout << debug_prefix << "restoring " << it->first.second
+ << " failed"
+ << " with " << err << std::endl;
+ return err;
+ }
+ if (verbose) cout << debug_prefix << " restored " << it->first.second
+ << std::endl;
+ break;
+ default:
+ if (verbose) cout << debug_prefix << " performing unknown op on "
+ << it->first.second
+ << std::endl;
+ err = io_ctx.operate(index_name, it->second);
+ if (err < 0) {
+ if (verbose) cout << debug_prefix << " unknown op on "
+ << it->first.second
+ << " failed with " << err << std::endl;
+ return err;
+ }
+ if (verbose) cout << debug_prefix << " unknown op on "
+ << it->first.second
+ << " succeeded." << std::endl;
+ break;
+ }
+ }
+
+ return err;
+}
+
+int KvFlatBtreeAsync::cleanup(const index_data &idata, const int &error) {
+ if (verbose) cout << "\t\t" << client_name << ": cleaning up after "
+ << idata.str()
+ << std::endl;
+ int err = 0;
+ ceph_assert(idata.prefix != "");
+ map<std::string,bufferlist> new_index;
+ map<std::string, pair<bufferlist, int> > assertions;
+ switch (error) {
+ case -EFIRSTOBJ: {
+ //this happens if the split or rebalance failed to mark the first object,
+ //meaning only the index needs to be changed.
+ //restore objects that had been marked unwritable.
+ for(vector<delete_data >::const_iterator it =
+ idata.to_delete.begin();
+ it != idata.to_delete.end(); ++it) {
+ index_data this_entry;
+ this_entry.obj = (*it).obj;
+ this_entry.min_kdata = it->min;
+ this_entry.kdata = it->max;
+ new_index[it->max.encoded()] = to_bl(this_entry);
+ this_entry = idata;
+ this_entry.obj = it->obj;
+ this_entry.min_kdata = it->min;
+ this_entry.kdata = it->max;
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: will assert index contains "
+ << this_entry.str() << std::endl;
+ assertions[it->max.encoded()] =
+ pair<bufferlist, int>(to_bl(this_entry),
+ CEPH_OSD_CMPXATTR_OP_EQ);
+ }
+
+ //update the index
+ librados::ObjectWriteOperation update_index;
+ update_index.omap_cmp(assertions, &err);
+ update_index.omap_set(new_index);
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: updating index"
+ << std::endl;
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 ) {
+ return -ESUICIDE;
+ }
+ err = io_ctx.operate(index_name, &update_index);
+ if (err < 0) {
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: rewriting failed with "
+ << err << ". returning -ECANCELED" << std::endl;
+ return -ECANCELED;
+ }
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: updated index. cleanup done."
+ << std::endl;
+ break;
+ }
+ case -ERANGE: {
+ //this happens if a split or rebalance fails to mark an object. It is a
+ //special case of rolling back that does not have to deal with new objects.
+
+ //restore objects that had been marked unwritable.
+ vector<delete_data >::const_iterator it;
+ for(it = idata.to_delete.begin();
+ it != idata.to_delete.end(); ++it) {
+ index_data this_entry;
+ this_entry.obj = (*it).obj;
+ this_entry.min_kdata = it->min;
+ this_entry.kdata = it->max;
+ new_index[it->max.encoded()] = to_bl(this_entry);
+ this_entry = idata;
+ this_entry.obj = it->obj;
+ this_entry.min_kdata = it->min;
+ this_entry.kdata = it->max;
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: will assert index contains "
+ << this_entry.str() << std::endl;
+ assertions[it->max.encoded()] =
+ pair<bufferlist, int>(to_bl(this_entry),
+ CEPH_OSD_CMPXATTR_OP_EQ);
+ }
+ it = idata.to_delete.begin();
+ librados::ObjectWriteOperation restore;
+ set_up_restore_object(&restore);
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 ) {
+ return -ESUICIDE;
+ }
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: restoring "
+ << it->obj
+ << std::endl;
+ err = io_ctx.operate(it->obj, &restore);
+ if (err < 0) {
+ //i.e., -ECANCELED because the object was already restored by someone
+ //else
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: restoring "
+ << it->obj
+ << " failed with " << err << std::endl;
+ } else {
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: restored "
+ << it->obj
+ << std::endl;
+ }
+
+ //update the index
+ librados::ObjectWriteOperation update_index;
+ update_index.omap_cmp(assertions, &err);
+ update_index.omap_set(new_index);
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: updating index"
+ << std::endl;
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 ) {
+ return -ESUICIDE;
+ }
+ err = io_ctx.operate(index_name, &update_index);
+ if (err < 0) {
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: rewriting failed with "
+ << err << ". returning -ECANCELED" << std::endl;
+ return -ECANCELED;
+ }
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: updated index. cleanup done."
+ << std::endl;
+ break;
+ }
+ case -ENOENT: {
+ if (verbose) cout << "\t\t" << client_name << "-cleanup: rolling forward"
+ << std::endl;
+ //all changes were created except for updating the index and possibly
+ //deleting the objects. roll forward.
+ vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > ops;
+ vector<librados::ObjectWriteOperation> owos(idata.to_delete.size() + 1);
+ for (int i = 0; i <= (int)idata.to_delete.size(); ++i) {
+ ops.push_back(make_pair(pair<int, string>(0, ""), &owos[i]));
+ }
+ set_up_ops(vector<object_data>(),
+ vector<object_data>(), &ops, idata, &err);
+ err = perform_ops("\t\t" + client_name + "-cleanup:", idata, &ops);
+ if (err < 0) {
+ if (err == -ESUICIDE) {
+ return -ESUICIDE;
+ }
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: rewriting failed with "
+ << err << ". returning -ECANCELED" << std::endl;
+ return -ECANCELED;
+ }
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: updated index"
+ << std::endl;
+ break;
+ }
+ default: {
+ //roll back all changes.
+ if (verbose) cout << "\t\t" << client_name << "-cleanup: rolling back"
+ << std::endl;
+ map<std::string,bufferlist> new_index;
+ std::set<string> to_remove;
+ map<std::string, pair<bufferlist, int> > assertions;
+
+ //mark the objects to be created. if someone else already has, die.
+ for(vector<create_data >::const_reverse_iterator it =
+ idata.to_create.rbegin();
+ it != idata.to_create.rend(); ++it) {
+ librados::ObjectWriteOperation rm;
+ set_up_unwrite_object(0, &rm);
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 )
+ {
+ return -ESUICIDE;
+ }
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: marking "
+ << it->obj
+ << std::endl;
+ err = io_ctx.operate(it->obj, &rm);
+ if (err < 0) {
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: marking "
+ << it->obj
+ << " failed with " << err << std::endl;
+ } else {
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: marked "
+ << it->obj
+ << std::endl;
+ }
+ }
+
+ //restore objects that had been marked unwritable.
+ for(vector<delete_data >::const_iterator it =
+ idata.to_delete.begin();
+ it != idata.to_delete.end(); ++it) {
+ index_data this_entry;
+ this_entry.obj = (*it).obj;
+ this_entry.min_kdata = it->min;
+ this_entry.kdata = it->max;
+ new_index[it->max.encoded()] = to_bl(this_entry);
+ this_entry = idata;
+ this_entry.obj = it->obj;
+ this_entry.min_kdata = it->min;
+ this_entry.kdata = it->max;
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: will assert index contains "
+ << this_entry.str() << std::endl;
+ assertions[it->max.encoded()] =
+ pair<bufferlist, int>(to_bl(this_entry),
+ CEPH_OSD_CMPXATTR_OP_EQ);
+ librados::ObjectWriteOperation restore;
+ set_up_restore_object(&restore);
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: will assert index contains "
+ << this_entry.str() << std::endl;
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 )
+ {
+ return -ESUICIDE;
+ }
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: restoring "
+ << it->obj
+ << std::endl;
+ err = io_ctx.operate(it->obj, &restore);
+ if (err == -ENOENT) {
+ //it had gotten far enough to be rolled forward - unmark the objects
+ //and roll forward.
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: roll forward instead"
+ << std::endl;
+ for(vector<create_data >::const_iterator cit =
+ idata.to_create.begin();
+ cit != idata.to_create.end(); ++cit) {
+ librados::ObjectWriteOperation res;
+ set_up_restore_object(&res);
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)()
+ == 1 ) {
+ return -ECANCELED;
+ }
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: restoring " << cit->obj
+ << std::endl;
+ err = io_ctx.operate(cit->obj, &res);
+ if (err < 0) {
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: restoring "
+ << cit->obj << " failed with " << err << std::endl;
+ }
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: restored "
+ << cit->obj
+ << std::endl;
+ }
+ return cleanup(idata, -ENOENT);
+ } else if (err < 0) {
+ //i.e., -ECANCELED because the object was already restored by someone
+ //else
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: restoring " << it->obj
+ << " failed with " << err << std::endl;
+ } else {
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: restored "
+ << it->obj
+ << std::endl;
+ }
+ }
+
+ //remove the new objects
+ for(vector<create_data >::const_reverse_iterator it =
+ idata.to_create.rbegin();
+ it != idata.to_create.rend(); ++it) {
+ to_remove.insert(it->max.encoded());
+ librados::ObjectWriteOperation rm;
+ rm.remove();
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 )
+ {
+ return -ESUICIDE;
+ }
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: removing "
+ << it->obj
+ << std::endl;
+ err = io_ctx.operate(it->obj, &rm);
+ if (err < 0) {
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: failed to remove "
+ << it->obj << std::endl;
+ } else {
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: removed "
+ << it->obj
+ << std::endl;
+ }
+ }
+
+ //update the index
+ librados::ObjectWriteOperation update_index;
+ update_index.omap_cmp(assertions, &err);
+ update_index.omap_rm_keys(to_remove);
+ update_index.omap_set(new_index);
+ if (verbose) cout << "\t\t\t" << client_name << "-cleanup: updating index"
+ << std::endl;
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 ) {
+ return -ESUICIDE;
+ }
+ err = io_ctx.operate(index_name, &update_index);
+ if (err < 0) {
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: rewriting failed with "
+ << err << ". returning -ECANCELED" << std::endl;
+ return -ECANCELED;
+ }
+ if (verbose) cout << "\t\t\t" << client_name
+ << "-cleanup: updated index. cleanup done."
+ << std::endl;
+ break;
+ }
+ }
+ return err;
+}
+
+string KvFlatBtreeAsync::to_string(string s, int i) {
+ stringstream ret;
+ ret << s << i;
+ return ret.str();
+}
+
+string KvFlatBtreeAsync::get_name() {
+ return rados_id;
+}
+
+void KvFlatBtreeAsync::set_inject(injection_t inject, int wait_time) {
+ interrupt = inject;
+ wait_ms = wait_time;
+}
+
+int KvFlatBtreeAsync::setup(int argc, const char** argv) {
+ int r = rados.init(rados_id.c_str());
+ if (r < 0) {
+ cerr << "error during init" << r << std::endl;
+ return r;
+ }
+ r = rados.conf_parse_argv(argc, argv);
+ if (r < 0) {
+ cerr << "error during parsing args" << r << std::endl;
+ return r;
+ }
+ r = rados.conf_parse_env(NULL);
+ if (r < 0) {
+ cerr << "error during parsing env" << r << std::endl;
+ return r;
+ }
+ r = rados.conf_read_file(NULL);
+ if (r < 0) {
+ cerr << "error during read file: " << r << std::endl;
+ return r;
+ }
+ r = rados.connect();
+ if (r < 0) {
+ cerr << "error during connect: " << r << std::endl;
+ return r;
+ }
+ r = rados.ioctx_create(pool_name.c_str(), io_ctx);
+ if (r < 0) {
+ cerr << "error creating io ctx: " << r << std::endl;
+ rados.shutdown();
+ return r;
+ }
+
+ librados::ObjectWriteOperation make_index;
+ make_index.create(true);
+ map<std::string,bufferlist> index_map;
+ index_data idata;
+ idata.obj = client_name;
+ idata.min_kdata.raw_key = "";
+ idata.kdata = key_data("");
+ index_map["1"] = to_bl(idata);
+ make_index.omap_set(index_map);
+ r = io_ctx.operate(index_name, &make_index);
+ if (r < 0) {
+ if (verbose) cout << client_name << ": Making the index failed with code "
+ << r
+ << std::endl;
+ return 0;
+ }
+ if (verbose) cout << client_name << ": created index object" << std::endl;
+
+ librados::ObjectWriteOperation make_max_obj;
+ make_max_obj.create(true);
+ make_max_obj.setxattr("unwritable", to_bl("0"));
+ make_max_obj.setxattr("size", to_bl("0"));
+ r = io_ctx.operate(client_name, &make_max_obj);
+ if (r < 0) {
+ if (verbose) cout << client_name << ": Setting xattr failed with code "
+ << r
+ << std::endl;
+ }
+
+ return 0;
+}
+
+int KvFlatBtreeAsync::set(const string &key, const bufferlist &val,
+ bool update_on_existing) {
+ if (verbose) cout << client_name << " is "
+ << (update_on_existing? "updating " : "setting ")
+ << key << std::endl;
+ int err = 0;
+ utime_t mytime;
+ index_data idata(key);
+
+ if (verbose) cout << "\t" << client_name << ": finding oid" << std::endl;
+ err = read_index(key, &idata, NULL, false);
+ if (err < 0) {
+ if (verbose) cout << "\t" << client_name
+ << ": getting oid failed with code "
+ << err << std::endl;
+ return err;
+ }
+ if (verbose) cout << "\t" << client_name << ": index data is " << idata.str()
+ << ", object is " << idata.obj << std::endl;
+
+ err = set_op(key, val, update_on_existing, idata);
+
+ if (verbose) cout << "\t" << client_name << ": finished set with " << err
+ << std::endl;
+ return err;
+}
+
+int KvFlatBtreeAsync::set_op(const string &key, const bufferlist &val,
+ bool update_on_existing, index_data &idata) {
+ //write
+
+ bufferlist inbl;
+ omap_set_args args;
+ args.bound = 2 * k;
+ args.exclusive = !update_on_existing;
+ args.omap[key] = val;
+ args.encode(inbl);
+
+ librados::ObjectWriteOperation owo;
+ owo.exec("kvs", "omap_insert", inbl);
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 ) {
+ if (verbose) cout << client_name << " IS SUICIDING!" << std::endl;
+ return -ESUICIDE;
+ }
+ if (verbose) cout << "\t" << client_name << ": inserting " << key
+ << " into object "
+ << idata.obj << std::endl;
+ int err = io_ctx.operate(idata.obj, &owo);
+ if (err < 0) {
+ switch (err) {
+ case -EEXIST: {
+ //the key already exists and this is an exclusive insert.
+ cerr << "\t" << client_name << ": writing key failed with "
+ << err << std::endl;
+ return err;
+ }
+ case -EKEYREJECTED: {
+ //the object needs to be split.
+ do {
+ if (verbose) cout << "\t" << client_name << ": running split on "
+ << idata.obj
+ << std::endl;
+ err = read_index(key, &idata, NULL, true);
+ if (err < 0) {
+ if (verbose) cout << "\t" << client_name
+ << ": getting oid failed with code "
+ << err << std::endl;
+ return err;
+ }
+ err = split(idata);
+ if (err < 0 && err != -ENOENT && err != -EBALANCE) {
+ if (verbose) cerr << "\t" << client_name << ": split failed with "
+ << err << std::endl;
+ int ret = handle_set_rm_errors(err, idata.obj, key, &idata, NULL);
+ switch (ret) {
+ case -ESUICIDE:
+ if (verbose) cout << client_name << " IS SUICIDING!" << std::endl;
+ return ret;
+ break;
+ case 1:
+ return set_op(key, val, update_on_existing, idata);
+ break;
+ case 2:
+ return err;
+ break;
+ }
+ }
+ } while (err < 0 && err != -EBALANCE && err != -ENOENT);
+ err = read_index(key, &idata, NULL, true);
+ if (err < 0) {
+ if (verbose) cout << "\t" << client_name
+ << ": getting oid failed with code "
+ << err << std::endl;
+ return err;
+ }
+ return set_op(key, val, update_on_existing, idata);
+ }
+ default:
+ if (verbose) cerr << "\t" << client_name << ": writing obj failed with "
+ << err << std::endl;
+ if (err == -ENOENT || err == -EACCES) {
+ if (err == -ENOENT) {
+ if (verbose) cout << "CACHE FAILURE" << std::endl;
+ }
+ err = read_index(key, &idata, NULL, true);
+ if (err < 0) {
+ if (verbose) cout << "\t" << client_name
+ << ": getting oid failed with code "
+ << err << std::endl;
+ return err;
+ }
+ if (verbose) cout << "\t" << client_name << ": index data is "
+ << idata.str()
+ << ", object is " << idata.obj << std::endl;
+ return set_op(key, val, update_on_existing, idata);
+ } else {
+ return err;
+ }
+ }
+ }
+ return 0;
+}
+
+int KvFlatBtreeAsync::remove(const string &key) {
+ if (verbose) cout << client_name << ": removing " << key << std::endl;
+ int err = 0;
+ string obj;
+ utime_t mytime;
+ index_data idata;
+ index_data next_idata;
+
+ if (verbose) cout << "\t" << client_name << ": finding oid" << std::endl;
+ err = read_index(key, &idata, &next_idata, false);
+ if (err < 0) {
+ if (verbose) cout << "getting oid failed with code " << err << std::endl;
+ return err;
+ }
+ obj = idata.obj;
+ if (verbose) cout << "\t" << client_name << ": idata is " << idata.str()
+ << ", next_idata is " << next_idata.str()
+ << ", obj is " << obj << std::endl;
+
+ err = remove_op(key, idata, next_idata);
+
+ if (verbose) cout << "\t" << client_name << ": finished remove with " << err
+ << " and exiting" << std::endl;
+ return err;
+}
+
+int KvFlatBtreeAsync::remove_op(const string &key, index_data &idata,
+ index_data &next_idata) {
+ //write
+ bufferlist inbl;
+ omap_rm_args args;
+ args.bound = k;
+ args.omap.insert(key);
+ args.encode(inbl);
+
+ librados::ObjectWriteOperation owo;
+ owo.exec("kvs", "omap_remove", inbl);
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 ) {
+ if (verbose) cout << client_name << " IS SUICIDING!" << std::endl;
+ return -ESUICIDE;
+ }
+ if (verbose) cout << "\t" << client_name << ": removing " << key << " from "
+ << idata.obj
+ << std::endl;
+ int err = io_ctx.operate(idata.obj, &owo);
+ if (err < 0) {
+ if (verbose) cout << "\t" << client_name << ": writing obj failed with "
+ << err << std::endl;
+ switch (err) {
+ case -ENODATA: {
+ //the key does not exist in the object
+ return err;
+ }
+ case -EKEYREJECTED: {
+ //the object needs to be split.
+ do {
+ if (verbose) cerr << "\t" << client_name << ": running rebalance on "
+ << idata.obj << std::endl;
+ err = read_index(key, &idata, &next_idata, true);
+ if (err < 0) {
+ if (verbose) cout << "\t" << client_name
+ << ": getting oid failed with code "
+ << err << std::endl;
+ return err;
+ }
+ err = rebalance(idata, next_idata);
+ if (err < 0 && err != -ENOENT && err != -EBALANCE) {
+ if (verbose) cerr << "\t" << client_name << ": rebalance returned "
+ << err << std::endl;
+ int ret = handle_set_rm_errors(err, idata.obj, key, &idata,
+ &next_idata);
+ switch (ret) {
+ case -ESUICIDE:
+ if (verbose) cout << client_name << " IS SUICIDING!" << std::endl;
+ return err;
+ break;
+ case 1:
+ return remove_op(key, idata, next_idata);
+ break;
+ case 2:
+ return err;
+ break;
+ case -EUCLEAN:
+ //this is the only node, so it's ok to go below k.
+ librados::ObjectWriteOperation owo;
+ bufferlist inbl;
+ omap_rm_args args;
+ args.bound = 0;
+ args.omap.insert(key);
+ args.encode(inbl);
+ owo.exec("kvs", "omap_remove", inbl);
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)()
+ == 1 ) {
+ if (verbose) cout << client_name << " IS SUICIDING!"
+ << std::endl;
+ return -ESUICIDE;
+ }
+ if (verbose) cout << "\t" << client_name << ": removing " << key
+ << " from "
+ << idata.obj
+ << std::endl;
+ int err = io_ctx.operate(idata.obj, &owo);
+ if (err == 0) {
+ return 0;
+ }
+ }
+ }
+ } while (err < 0 && err != -EBALANCE && err != -ENOENT);
+ err = read_index(key, &idata, &next_idata, true);
+ if (err < 0) {
+ if (verbose) cout << "\t" << client_name
+ << ": getting oid failed with code "
+ << err << std::endl;
+ return err;
+ }
+ return remove(key);
+ }
+ default:
+ if (err == -ENOENT || err == -EACCES) {
+ err = read_index(key, &idata, &next_idata, true);
+ if (err < 0) {
+ if (verbose) cout << "\t" << client_name
+ << ": getting oid failed with code "
+ << err << std::endl;
+ return err;
+ }
+ if (verbose) cout << "\t" << client_name << ": index data is "
+ << idata.str()
+ << ", object is " << idata.obj << std::endl;
+ //idea: we read the time every time we read the index anyway - store it.
+ return remove_op(key, idata, next_idata);
+ } else {
+ return err;
+ }
+ }
+ }
+ return 0;
+}
+
+int KvFlatBtreeAsync::handle_set_rm_errors(int &err, string obj,
+ string key,
+ index_data * idata, index_data * next_idata) {
+ if (err == -ESUICIDE) {
+ return err;
+ } else if (err == -ECANCELED //if an object was unwritable or index changed
+ || err == -EPREFIX //if there is currently a prefix
+ || err == -ETIMEDOUT// if the index changes during the op - i.e. cleanup
+ || err == -EACCES) //possible if we were acting on old index data
+ {
+ err = read_index(key, idata, next_idata, true);
+ if (err < 0) {
+ return err;
+ }
+ if (verbose) cout << "\t" << client_name << ": prefix is " << idata->str()
+ << std::endl;
+ if (idata->obj != obj) {
+ //someone else has split or cleaned up or something. start over.
+ return 1;//meaning repeat
+ }
+ } else if (err != -ETIMEDOUT && err != -ERANGE && err != -EACCES
+ && err != -EUCLEAN){
+ if (verbose) cout << "\t" << client_name
+ << ": split encountered an unexpected error: " << err
+ << std::endl;
+ return 2;
+ }
+ return err;
+}
+
+int KvFlatBtreeAsync::get(const string &key, bufferlist *val) {
+ opmap['g']++;
+ if (verbose) cout << client_name << ": getting " << key << std::endl;
+ int err = 0;
+ index_data idata;
+ utime_t mytime;
+
+ if ((((KeyValueStructure *)this)->*KvFlatBtreeAsync::interrupt)() == 1 ) {
+ return -ESUICIDE;
+ }
+ err = read_index(key, &idata, NULL, false);
+ mytime = ceph_clock_now();
+ if (err < 0) {
+ if (verbose) cout << "getting oid failed with code " << err << std::endl;
+ return err;
+ }
+
+ err = get_op(key, val, idata);
+
+ if (verbose) cout << client_name << ": got " << key << " with " << err
+ << std::endl;
+
+ return err;
+}
+
+int KvFlatBtreeAsync::get_op(const string &key, bufferlist *val,
+ index_data &idata) {
+ int err = 0;
+ std::set<std::string> key_set;
+ key_set.insert(key);
+ map<std::string,bufferlist> omap;
+ librados::ObjectReadOperation read;
+ read.omap_get_vals_by_keys(key_set, &omap, &err);
+ err = io_ctx.operate(idata.obj, &read, NULL);
+ if (err < 0) {
+ if (err == -ENOENT) {
+ err = read_index(key, &idata, NULL, true);
+ if (err < 0) {
+ if (verbose) cout << "\t" << client_name
+ << ": getting oid failed with code "
+ << err << std::endl;
+ return err;
+ }
+ if (verbose) cout << "\t" << client_name << ": index data is "
+ << idata.str()
+ << ", object is " << idata.obj << std::endl;
+ return get_op(key, val, idata);
+ } else {
+ if (verbose) cout << client_name
+ << ": get encountered an unexpected error: " << err
+ << std::endl;
+ return err;
+ }
+ }
+
+ *val = omap[key];
+ return err;
+}
+
+void *KvFlatBtreeAsync::pset(void *ptr) {
+ struct aio_set_args *args = (struct aio_set_args *)ptr;
+ *args->err =
+ args->kvba->KvFlatBtreeAsync::set((string)args->key,
+ (bufferlist)args->val, (bool)args->exc);
+ args->cb(args->err, args->cb_args);
+ delete args;
+ return NULL;
+}
+
+void KvFlatBtreeAsync::aio_set(const string &key, const bufferlist &val,
+ bool exclusive, callback cb, void * cb_args, int * err) {
+ aio_set_args *args = new aio_set_args();
+ args->kvba = this;
+ args->key = key;
+ args->val = val;
+ args->exc = exclusive;
+ args->cb = cb;
+ args->cb_args = cb_args;
+ args->err = err;
+ pthread_t t;
+ int r = pthread_create(&t, NULL, pset, (void*)args);
+ if (r < 0) {
+ *args->err = r;
+ return;
+ }
+ pthread_detach(t);
+}
+
+void *KvFlatBtreeAsync::prm(void *ptr) {
+ struct aio_rm_args *args = (struct aio_rm_args *)ptr;
+ *args->err =
+ args->kvba->KvFlatBtreeAsync::remove((string)args->key);
+ args->cb(args->err, args->cb_args);
+ delete args;
+ return NULL;
+}
+
+void KvFlatBtreeAsync::aio_remove(const string &key,
+ callback cb, void * cb_args, int * err) {
+ aio_rm_args * args = new aio_rm_args();
+ args->kvba = this;
+ args->key = key;
+ args->cb = cb;
+ args->cb_args = cb_args;
+ args->err = err;
+ pthread_t t;
+ int r = pthread_create(&t, NULL, prm, (void*)args);
+ if (r < 0) {
+ *args->err = r;
+ return;
+ }
+ pthread_detach(t);
+}
+
+void *KvFlatBtreeAsync::pget(void *ptr) {
+ struct aio_get_args *args = (struct aio_get_args *)ptr;
+ *args->err =
+ args->kvba->KvFlatBtreeAsync::get((string)args->key,
+ (bufferlist *)args->val);
+ args->cb(args->err, args->cb_args);
+ delete args;
+ return NULL;
+}
+
+void KvFlatBtreeAsync::aio_get(const string &key, bufferlist *val,
+ callback cb, void * cb_args, int * err) {
+ aio_get_args * args = new aio_get_args();
+ args->kvba = this;
+ args->key = key;
+ args->val = val;
+ args->cb = cb;
+ args->cb_args = cb_args;
+ args->err = err;
+ pthread_t t;
+ int r = pthread_create(&t, NULL, pget, (void*)args);
+ if (r < 0) {
+ *args->err = r;
+ return;
+ }
+ pthread_detach(t);
+}
+
+int KvFlatBtreeAsync::set_many(const map<string, bufferlist> &in_map) {
+ int err = 0;
+ bufferlist inbl;
+ bufferlist outbl;
+ std::set<string> keys;
+
+ map<string, bufferlist> big_map;
+ for (map<string, bufferlist>::const_iterator it = in_map.begin();
+ it != in_map.end(); ++it) {
+ keys.insert(it->first);
+ big_map.insert(*it);
+ }
+
+ if (verbose) cout << "created key set and big_map" << std::endl;
+
+ encode(keys, inbl);
+ librados::AioCompletion * aioc = rados.aio_create_completion();
+ io_ctx.aio_exec(index_name, aioc, "kvs", "read_many", inbl, &outbl);
+ aioc->wait_for_safe();
+ err = aioc->get_return_value();
+ aioc->release();
+ if (err < 0) {
+ cerr << "getting index failed with " << err << std::endl;
+ return err;
+ }
+
+ map<string, bufferlist> imap;//read from the index
+ auto blit = outbl.cbegin();
+ decode(imap, blit);
+
+ if (verbose) cout << "finished reading index for objects. there are "
+ << imap.size() << " entries that need to be changed. " << std::endl;
+
+
+ vector<object_data> to_delete;
+
+ vector<object_data> to_create;
+
+ if (verbose) cout << "setting up to_delete and to_create vectors from index "
+ << "map" << std::endl;
+ //set up to_delete from index map
+ for (map<string, bufferlist>::iterator it = imap.begin(); it != imap.end();
+ ++it){
+ index_data idata;
+ blit = it->second.begin();
+ idata.decode(blit);
+ to_delete.push_back(object_data(idata.min_kdata, idata.kdata, idata.obj));
+ err = read_object(idata.obj, &to_delete[to_delete.size() - 1]);
+ if (err < 0) {
+ if (verbose) cout << "reading " << idata.obj << " failed with " << err
+ << std::endl;
+ return set_many(in_map);
+ }
+
+ big_map.insert(to_delete[to_delete.size() - 1].omap.begin(),
+ to_delete[to_delete.size() - 1].omap.end());
+ }
+
+ to_create.push_back(object_data(
+ to_string(client_name, client_index++)));
+ to_create[0].min_kdata = to_delete[0].min_kdata;
+
+ for(map<string, bufferlist>::iterator it = big_map.begin();
+ it != big_map.end(); ++it) {
+ if (to_create[to_create.size() - 1].omap.size() == 1.5 * k) {
+ to_create[to_create.size() - 1].max_kdata =
+ key_data(to_create[to_create.size() - 1]
+ .omap.rbegin()->first);
+
+ to_create.push_back(object_data(
+ to_string(client_name, client_index++)));
+ to_create[to_create.size() - 1].min_kdata =
+ to_create[to_create.size() - 2].max_kdata;
+ }
+
+ to_create[to_create.size() - 1].omap.insert(*it);
+ }
+ to_create[to_create.size() - 1].max_kdata =
+ to_delete[to_delete.size() - 1].max_kdata;
+
+ vector<librados::ObjectWriteOperation> owos(2 + 2 * to_delete.size()
+ + to_create.size());
+ vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > ops;
+
+
+ index_data idata;
+ set_up_prefix_index(to_create, to_delete, &owos[0], &idata, &err);
+
+ if (verbose) cout << "finished making to_create and to_delete. "
+ << std::endl;
+
+ ops.push_back(make_pair(
+ pair<int, string>(ADD_PREFIX, index_name),
+ &owos[0]));
+ for (int i = 1; i < 2 + 2 * (int)to_delete.size() + (int)to_create.size();
+ i++) {
+ ops.push_back(make_pair(make_pair(0,""), &owos[i]));
+ }
+
+ set_up_ops(to_create, to_delete, &ops, idata, &err);
+
+ cout << "finished setting up ops. Starting critical section..." << std::endl;
+
+ /////BEGIN CRITICAL SECTION/////
+ //put prefix on index entry for idata.val
+ err = perform_ops("\t\t" + client_name + "-set_many:", idata, &ops);
+ if (err < 0) {
+ return set_many(in_map);
+ }
+ if (verbose) cout << "\t\t" << client_name << "-split: done splitting."
+ << std::endl;
+ /////END CRITICAL SECTION/////
+ icache_lock.Lock();
+ for (vector<delete_data>::iterator it = idata.to_delete.begin();
+ it != idata.to_delete.end(); ++it) {
+ icache.erase(it->max);
+ }
+ for (vector<create_data>::iterator it = idata.to_create.begin();
+ it != idata.to_create.end(); ++it) {
+ icache.push(index_data(*it));
+ }
+ icache_lock.Unlock();
+ return err;
+}
+
+int KvFlatBtreeAsync::remove_all() {
+ if (verbose) cout << client_name << ": removing all" << std::endl;
+ int err = 0;
+ librados::ObjectReadOperation oro;
+ librados::AioCompletion * oro_aioc = rados.aio_create_completion();
+ std::map<std::string, bufferlist> index_set;
+ oro.omap_get_vals2("",LONG_MAX,&index_set, nullptr, &err);
+ err = io_ctx.aio_operate(index_name, oro_aioc, &oro, NULL);
+ if (err < 0){
+ if (err == -ENOENT) {
+ return 0;
+ }
+ if (verbose) cout << "getting keys failed with error " << err << std::endl;
+ return err;
+ }
+ oro_aioc->wait_for_safe();
+ oro_aioc->release();
+
+ librados::ObjectWriteOperation rm_index;
+ librados::AioCompletion * rm_index_aioc = rados.aio_create_completion();
+ map<std::string,bufferlist> new_index;
+ new_index["1"] = index_set["1"];
+ rm_index.omap_clear();
+ rm_index.omap_set(new_index);
+ io_ctx.aio_operate(index_name, rm_index_aioc, &rm_index);
+ err = rm_index_aioc->get_return_value();
+ rm_index_aioc->release();
+ if (err < 0) {
+ if (verbose) cout << "rm index aioc failed with " << err
+ << std::endl;
+ return err;
+ }
+
+ if (!index_set.empty()) {
+ for (std::map<std::string,bufferlist>::iterator it = index_set.begin();
+ it != index_set.end(); ++it){
+ librados::ObjectWriteOperation sub;
+ if (it->first == "1") {
+ sub.omap_clear();
+ } else {
+ sub.remove();
+ }
+ index_data idata;
+ auto b = it->second.cbegin();
+ idata.decode(b);
+ io_ctx.operate(idata.obj, &sub);
+ }
+ }
+
+ icache.clear();
+
+ return 0;
+}
+
+int KvFlatBtreeAsync::get_all_keys(std::set<std::string> *keys) {
+ if (verbose) cout << client_name << ": getting all keys" << std::endl;
+ int err = 0;
+ librados::ObjectReadOperation oro;
+ std::map<std::string,bufferlist> index_set;
+ oro.omap_get_vals2("",LONG_MAX,&index_set, nullptr, &err);
+ io_ctx.operate(index_name, &oro, NULL);
+ if (err < 0){
+ if (verbose) cout << "getting keys failed with error " << err << std::endl;
+ return err;
+ }
+ for (std::map<std::string,bufferlist>::iterator it = index_set.begin();
+ it != index_set.end(); ++it){
+ librados::ObjectReadOperation sub;
+ std::set<std::string> ret;
+ sub.omap_get_keys2("",LONG_MAX,&ret, nullptr, &err);
+ index_data idata;
+ auto b = it->second.cbegin();
+ idata.decode(b);
+ io_ctx.operate(idata.obj, &sub, NULL);
+ keys->insert(ret.begin(), ret.end());
+ }
+ return err;
+}
+
+int KvFlatBtreeAsync::get_all_keys_and_values(
+ map<std::string,bufferlist> *kv_map) {
+ if (verbose) cout << client_name << ": getting all keys and values"
+ << std::endl;
+ int err = 0;
+ librados::ObjectReadOperation first_read;
+ std::set<std::string> index_set;
+ first_read.omap_get_keys2("",LONG_MAX,&index_set, nullptr, &err);
+ io_ctx.operate(index_name, &first_read, NULL);
+ if (err < 0){
+ if (verbose) cout << "getting keys failed with error " << err << std::endl;
+ return err;
+ }
+ for (std::set<std::string>::iterator it = index_set.begin();
+ it != index_set.end(); ++it){
+ librados::ObjectReadOperation sub;
+ map<std::string, bufferlist> ret;
+ sub.omap_get_vals2("",LONG_MAX,&ret, nullptr, &err);
+ io_ctx.operate(*it, &sub, NULL);
+ kv_map->insert(ret.begin(), ret.end());
+ }
+ return err;
+}
+
+bool KvFlatBtreeAsync::is_consistent() {
+ int err;
+ bool ret = true;
+ if (verbose) cout << client_name << ": checking consistency" << std::endl;
+ std::map<std::string,bufferlist> index;
+ map<std::string, std::set<std::string> > sub_objs;
+ librados::ObjectReadOperation oro;
+ oro.omap_get_vals2("",LONG_MAX,&index, nullptr, &err);
+ io_ctx.operate(index_name, &oro, NULL);
+ if (err < 0){
+ //probably because the index doesn't exist - this might be ok.
+ for (librados::NObjectIterator oit = io_ctx.nobjects_begin();
+ oit != io_ctx.nobjects_end(); ++oit) {
+ //if this executes, there are floating objects.
+ cerr << "Not consistent! found floating object " << oit->get_oid()
+ << std::endl;
+ ret = false;
+ }
+ return ret;
+ }
+
+ std::map<std::string, string> parsed_index;
+ std::set<std::string> onames;
+ std::set<std::string> special_names;
+ for (map<std::string,bufferlist>::iterator it = index.begin();
+ it != index.end(); ++it) {
+ if (it->first != "") {
+ index_data idata;
+ auto b = it->second.cbegin();
+ idata.decode(b);
+ if (idata.prefix != "") {
+ for(vector<delete_data>::iterator dit = idata.to_delete.begin();
+ dit != idata.to_delete.end(); ++dit) {
+ librados::ObjectReadOperation oro;
+ librados::AioCompletion * aioc = rados.aio_create_completion();
+ bufferlist un;
+ oro.getxattr("unwritable", &un, &err);
+ io_ctx.aio_operate(dit->obj, aioc, &oro, NULL);
+ aioc->wait_for_safe();
+ err = aioc->get_return_value();
+ if (ceph_clock_now() - idata.ts > timeout) {
+ if (err < 0) {
+ aioc->release();
+ if (err == -ENOENT) {
+ continue;
+ } else {
+ cerr << "Not consistent! reading object " << dit->obj
+ << "returned " << err << std::endl;
+ ret = false;
+ break;
+ }
+ }
+ if (atoi(string(un.c_str(), un.length()).c_str()) != 1 &&
+ aioc->get_version64() != dit->version) {
+ cerr << "Not consistent! object " << dit->obj << " has been "
+ << " modified since the client died was not cleaned up."
+ << std::endl;
+ ret = false;
+ }
+ }
+ special_names.insert(dit->obj);
+ aioc->release();
+ }
+ for(vector<create_data >::iterator cit = idata.to_create.begin();
+ cit != idata.to_create.end(); ++cit) {
+ special_names.insert(cit->obj);
+ }
+ }
+ parsed_index.insert(make_pair(it->first, idata.obj));
+ onames.insert(idata.obj);
+ }
+ }
+
+ //make sure that an object exists iff it either is the index
+ //or is listed in the index
+ for (librados::NObjectIterator oit = io_ctx.nobjects_begin();
+ oit != io_ctx.nobjects_end(); ++oit) {
+ string name = oit->get_oid();
+ if (name != index_name && onames.count(name) == 0
+ && special_names.count(name) == 0) {
+ cerr << "Not consistent! found floating object " << name << std::endl;
+ ret = false;
+ }
+ }
+
+ //check objects
+ string prev = "";
+ for (std::map<std::string, string>::iterator it = parsed_index.begin();
+ it != parsed_index.end();
+ ++it) {
+ librados::ObjectReadOperation read;
+ read.omap_get_keys2("", LONG_MAX, &sub_objs[it->second], nullptr, &err);
+ err = io_ctx.operate(it->second, &read, NULL);
+ int size_int = (int)sub_objs[it->second].size();
+
+ //check that size is in the right range
+ if (it->first != "1" && special_names.count(it->second) == 0 &&
+ err != -ENOENT && (size_int > 2*k|| size_int < k)
+ && parsed_index.size() > 1) {
+ cerr << "Not consistent! Object " << *it << " has size " << size_int
+ << ", which is outside the acceptable range." << std::endl;
+ ret = false;
+ }
+
+ //check that all keys belong in that object
+ for(std::set<std::string>::iterator subit = sub_objs[it->second].begin();
+ subit != sub_objs[it->second].end(); ++subit) {
+ if ((it->first != "1"
+ && *subit > it->first.substr(1,it->first.length()))
+ || *subit <= prev) {
+ cerr << "Not consistent! key " << *subit << " does not belong in "
+ << *it << std::endl;
+ cerr << "not last element, i.e. " << it->first << " not equal to 1? "
+ << (it->first != "1") << std::endl
+ << "greater than " << it->first.substr(1,it->first.length())
+ <<"? " << (*subit > it->first.substr(1,it->first.length()))
+ << std::endl
+ << "less than or equal to " << prev << "? "
+ << (*subit <= prev) << std::endl;
+ ret = false;
+ }
+ }
+
+ prev = it->first.substr(1,it->first.length());
+ }
+
+ if (!ret) {
+ if (verbose) cout << "failed consistency test - see error log"
+ << std::endl;
+ cerr << str();
+ } else {
+ if (verbose) cout << "passed consistency test" << std::endl;
+ }
+ return ret;
+}
+
+string KvFlatBtreeAsync::str() {
+ stringstream ret;
+ ret << "Top-level map:" << std::endl;
+ int err = 0;
+ std::set<std::string> keys;
+ std::map<std::string,bufferlist> index;
+ librados::ObjectReadOperation oro;
+ librados::AioCompletion * top_aioc = rados.aio_create_completion();
+ oro.omap_get_vals2("",LONG_MAX,&index, nullptr, &err);
+ io_ctx.aio_operate(index_name, top_aioc, &oro, NULL);
+ top_aioc->wait_for_safe();
+ err = top_aioc->get_return_value();
+ top_aioc->release();
+ if (err < 0 && err != -5){
+ if (verbose) cout << "getting keys failed with error " << err << std::endl;
+ return ret.str();
+ }
+ if(index.empty()) {
+ ret << "There are no objects!" << std::endl;
+ return ret.str();
+ }
+
+ for (map<std::string,bufferlist>::iterator it = index.begin();
+ it != index.end(); ++it) {
+ keys.insert(string(it->second.c_str(), it->second.length())
+ .substr(1,it->second.length()));
+ }
+
+ vector<std::string> all_names;
+ vector<int> all_sizes(index.size());
+ vector<int> all_versions(index.size());
+ vector<bufferlist> all_unwrit(index.size());
+ vector<map<std::string,bufferlist> > all_maps(keys.size());
+ vector<map<std::string,bufferlist>::iterator> its(keys.size());
+ unsigned done = 0;
+ vector<bool> dones(keys.size());
+ ret << std::endl << string(150,'-') << std::endl;
+
+ for (map<std::string,bufferlist>::iterator it = index.begin();
+ it != index.end(); ++it){
+ index_data idata;
+ auto b = it->second.cbegin();
+ idata.decode(b);
+ string s = idata.str();
+ ret << "|" << string((148 -
+ ((*it).first.length()+s.length()+3))/2,' ');
+ ret << (*it).first;
+ ret << " | ";
+ ret << string(idata.str());
+ ret << string((148 -
+ ((*it).first.length()+s.length()+3))/2,' ');
+ ret << "|\t";
+ all_names.push_back(idata.obj);
+ ret << std::endl << string(150,'-') << std::endl;
+ }
+
+ int indexer = 0;
+
+ //get the object names and sizes
+ for(vector<std::string>::iterator it = all_names.begin(); it
+ != all_names.end();
+ ++it) {
+ librados::ObjectReadOperation oro;
+ librados::AioCompletion *aioc = rados.aio_create_completion();
+ oro.omap_get_vals2("", LONG_MAX, &all_maps[indexer], nullptr, &err);
+ oro.getxattr("unwritable", &all_unwrit[indexer], &err);
+ io_ctx.aio_operate(*it, aioc, &oro, NULL);
+ aioc->wait_for_safe();
+ if (aioc->get_return_value() < 0) {
+ ret << "reading" << *it << "failed: " << err << std::endl;
+ //return ret.str();
+ }
+ all_sizes[indexer] = all_maps[indexer].size();
+ all_versions[indexer] = aioc->get_version64();
+ indexer++;
+ aioc->release();
+ }
+
+ ret << "///////////////////OBJECT NAMES////////////////" << std::endl;
+ //HEADERS
+ ret << std::endl;
+ for (int i = 0; i < indexer; i++) {
+ ret << "---------------------------\t";
+ }
+ ret << std::endl;
+ for (int i = 0; i < indexer; i++) {
+ ret << "|" << string((25 -
+ (string("Bucket: ").length() + all_names[i].length()))/2, ' ');
+ ret << "Bucket: " << all_names[i];
+ ret << string((25 -
+ (string("Bucket: ").length() + all_names[i].length()))/2, ' ') << "|\t";
+ }
+ ret << std::endl;
+ for (int i = 0; i < indexer; i++) {
+ its[i] = all_maps[i].begin();
+ ret << "|" << string((25 - (string("size: ").length()
+ + to_string("",all_sizes[i]).length()))/2, ' ');
+ ret << "size: " << all_sizes[i];
+ ret << string((25 - (string("size: ").length()
+ + to_string("",all_sizes[i]).length()))/2, ' ') << "|\t";
+ }
+ ret << std::endl;
+ for (int i = 0; i < indexer; i++) {
+ its[i] = all_maps[i].begin();
+ ret << "|" << string((25 - (string("version: ").length()
+ + to_string("",all_versions[i]).length()))/2, ' ');
+ ret << "version: " << all_versions[i];
+ ret << string((25 - (string("version: ").length()
+ + to_string("",all_versions[i]).length()))/2, ' ') << "|\t";
+ }
+ ret << std::endl;
+ for (int i = 0; i < indexer; i++) {
+ its[i] = all_maps[i].begin();
+ ret << "|" << string((25 - (string("unwritable? ").length()
+ + 1))/2, ' ');
+ ret << "unwritable? " << string(all_unwrit[i].c_str(),
+ all_unwrit[i].length());
+ ret << string((25 - (string("unwritable? ").length()
+ + 1))/2, ' ') << "|\t";
+ }
+ ret << std::endl;
+ for (int i = 0; i < indexer; i++) {
+ ret << "---------------------------\t";
+ }
+ ret << std::endl;
+ ret << "///////////////////THE ACTUAL BLOCKS////////////////" << std::endl;
+
+
+ ret << std::endl;
+ for (int i = 0; i < indexer; i++) {
+ ret << "---------------------------\t";
+ }
+ ret << std::endl;
+ //each time through this part is two lines
+ while(done < keys.size()) {
+ for(int i = 0; i < indexer; i++) {
+ if(dones[i]){
+ ret << " \t";
+ } else {
+ if (its[i] == all_maps[i].end()){
+ done++;
+ dones[i] = true;
+ ret << " \t";
+ } else {
+ ret << "|" << string((25 -
+ ((*its[i]).first.length()+its[i]->second.length()+3))/2,' ');
+ ret << (*its[i]).first;
+ ret << " | ";
+ ret << string(its[i]->second.c_str(), its[i]->second.length());
+ ret << string((25 -
+ ((*its[i]).first.length()+its[i]->second.length()+3))/2,' ');
+ ret << "|\t";
+ ++(its[i]);
+ }
+
+ }
+ }
+ ret << std::endl;
+ for (int i = 0; i < indexer; i++) {
+ if(dones[i]){
+ ret << " \t";
+ } else {
+ ret << "---------------------------\t";
+ }
+ }
+ ret << std::endl;
+
+ }
+ return ret.str();
+}
diff --git a/src/key_value_store/kv_flat_btree_async.h b/src/key_value_store/kv_flat_btree_async.h
new file mode 100644
index 00000000..346e4056
--- /dev/null
+++ b/src/key_value_store/kv_flat_btree_async.h
@@ -0,0 +1,899 @@
+/*
+ * Uses a two-level B-tree to store a set of key-value pairs.
+ *
+ * September 2, 2012
+ * Eleanor Cawthon
+ * eleanor.cawthon@inktank.com
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef KVFLATBTREEASYNC_H_
+#define KVFLATBTREEASYNC_H_
+
+#define ESUICIDE 134
+#define EPREFIX 136
+#define EFIRSTOBJ 138
+
+#include "key_value_store/key_value_structure.h"
+#include "include/utime.h"
+#include "include/types.h"
+#include "include/encoding.h"
+#include "common/Mutex.h"
+#include "common/Clock.h"
+#include "common/Formatter.h"
+#include "global/global_context.h"
+#include "include/rados/librados.hpp"
+#include <cfloat>
+#include <queue>
+#include <sstream>
+#include <stdarg.h>
+
+using ceph::bufferlist;
+
+enum {
+ ADD_PREFIX = 1,
+ MAKE_OBJECT = 2,
+ UNWRITE_OBJECT = 3,
+ RESTORE_OBJECT = 4,
+ REMOVE_OBJECT = 5,
+ REMOVE_PREFIX = 6,
+ AIO_MAKE_OBJECT = 7
+};
+
+struct rebalance_args;
+
+
+/**
+ * stores information about a key in the index.
+ *
+ * prefix is "0" unless key is "", in which case it is "1". This ensures that
+ * the object with key "" will always be the highest key in the index.
+ */
+struct key_data {
+ string raw_key;
+ string prefix;
+
+ key_data()
+ {}
+
+ /**
+ * @pre: key is a raw key (does not contain a prefix)
+ */
+ key_data(string key)
+ : raw_key(key)
+ {
+ raw_key == "" ? prefix = "1" : prefix = "0";
+ }
+
+ bool operator==(key_data k) const {
+ return ((raw_key == k.raw_key) && (prefix == k.prefix));
+ }
+
+ bool operator!=(key_data k) const {
+ return ((raw_key != k.raw_key) || (prefix != k.prefix));
+ }
+
+ bool operator<(key_data k) const {
+ return this->encoded() < k.encoded();
+ }
+
+ bool operator>(key_data k) const {
+ return this->encoded() > k.encoded();
+ }
+
+ /**
+ * parses the prefix from encoded and stores the data in this.
+ *
+ * @pre: encoded has a prefix
+ */
+ void parse(string encoded) {
+ prefix = encoded[0];
+ raw_key = encoded.substr(1,encoded.length());
+ }
+
+ /**
+ * returns a string containing the encoded (prefixed) key
+ */
+ string encoded() const {
+ return prefix + raw_key;
+ }
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(raw_key, bl);
+ encode(prefix, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(raw_key, p);
+ decode(prefix, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(key_data)
+
+
+/**
+ * Stores information read from a librados object.
+ */
+struct object_data {
+ key_data min_kdata; //the max key from the previous index entry
+ key_data max_kdata; //the max key, from the index
+ string name; //the object's name
+ map<std::string, bufferlist> omap; // the omap of the object
+ bool unwritable; // an xattr that, if false, means an op is in
+ // progress and other clients should not write to it.
+ uint64_t version; //the version at time of read
+ uint64_t size; //the number of elements in the omap
+
+ object_data()
+ : unwritable(false),
+ version(0),
+ size(0)
+ {}
+
+ object_data(string the_name)
+ : name(the_name),
+ unwritable(false),
+ version(0),
+ size(0)
+ {}
+
+ object_data(key_data min, key_data kdat, string the_name)
+ : min_kdata(min),
+ max_kdata(kdat),
+ name(the_name),
+ unwritable(false),
+ version(0),
+ size(0)
+ {}
+
+ object_data(key_data min, key_data kdat, string the_name,
+ map<std::string, bufferlist> the_omap)
+ : min_kdata(min),
+ max_kdata(kdat),
+ name(the_name),
+ omap(the_omap),
+ unwritable(false),
+ version(0),
+ size(0)
+ {}
+
+ object_data(key_data min, key_data kdat, string the_name, int the_version)
+ : min_kdata(min),
+ max_kdata(kdat),
+ name(the_name),
+ unwritable(false),
+ version(the_version),
+ size(0)
+ {}
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(min_kdata, bl);
+ encode(max_kdata, bl);
+ encode(name, bl);
+ encode(omap, bl);
+ encode(unwritable, bl);
+ encode(version, bl);
+ encode(size, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(min_kdata, p);
+ decode(max_kdata, p);
+ decode(name, p);
+ decode(omap, p);
+ decode(unwritable, p);
+ decode(version, p);
+ decode(size, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(object_data)
+
+/**
+ * information about objects to be created by a split or merge - stored in the
+ * index_data.
+ */
+struct create_data {
+ key_data min;
+ key_data max;
+ string obj;
+
+ create_data()
+ {}
+
+ create_data(key_data n, key_data x, string o)
+ : min(n),
+ max(x),
+ obj(o)
+ {}
+
+ create_data(object_data o)
+ : min(o.min_kdata),
+ max(o.max_kdata),
+ obj(o.name)
+ {}
+
+ create_data & operator=(const create_data &c) {
+ min = c.min;
+ max = c.max;
+ obj = c.obj;
+ return *this;
+ }
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(min, bl);
+ encode(max, bl);
+ encode(obj, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(min, p);
+ decode(max, p);
+ decode(obj, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(create_data)
+
+/**
+ * information about objects to be deleted by a split or merge - stored in the
+ * index_data.
+ */
+struct delete_data {
+ key_data min;
+ key_data max;
+ string obj;
+ uint64_t version;
+
+ delete_data()
+ : version(0)
+ {}
+
+ delete_data(key_data n, key_data x, string o, uint64_t v)
+ : min(n),
+ max(x),
+ obj(o),
+ version(v)
+ {}
+
+ delete_data & operator=(const delete_data &d) {
+ min = d.min;
+ max = d.max;
+ obj = d.obj;
+ version = d.version;
+ return *this;
+ }
+
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(min, bl);
+ encode(max, bl);
+ encode(obj, bl);
+ encode(version, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(min, p);
+ decode(max, p);
+ decode(obj, p);
+ decode(version, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(delete_data)
+
+/**
+ * The index object is a key value map that stores
+ * the highest key stored in an object as keys, and an index_data
+ * as the corresponding value. The index_data contains the encoded
+ * high and low keys (where keys in this object are > min_kdata and
+ * <= kdata), the name of the librados object where keys containing
+ * that range of keys are located, and information about split and
+ * merge operations that may need to be cleaned up if a client dies.
+ */
+struct index_data {
+ //the encoded key corresponding to the object
+ key_data kdata;
+
+ //"1" if there is a prefix (because a split or merge is
+ //in progress), otherwise ""
+ string prefix;
+
+ //the kdata of the previous index entry
+ key_data min_kdata;
+
+ utime_t ts; //time that a split/merge started
+
+ //objects to be created
+ vector<create_data > to_create;
+
+ //objects to be deleted
+ vector<delete_data > to_delete;
+
+ //the name of the object where the key range is located.
+ string obj;
+
+ index_data()
+ {}
+
+ index_data(string raw_key)
+ : kdata(raw_key)
+ {}
+
+ index_data(key_data max, key_data min, string o)
+ : kdata(max),
+ min_kdata(min),
+ obj(o)
+ {}
+
+ index_data(create_data c)
+ : kdata(c.max),
+ min_kdata(c.min),
+ obj(c.obj)
+ {}
+
+ bool operator<(const index_data &other) const {
+ return (kdata.encoded() < other.kdata.encoded());
+ }
+
+ //true if there is a prefix and now - ts > timeout.
+ bool is_timed_out(utime_t now, utime_t timeout) const;
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(prefix, bl);
+ encode(min_kdata, bl);
+ encode(kdata, bl);
+ encode(ts, bl);
+ encode(to_create, bl);
+ encode(to_delete, bl);
+ encode(obj, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(prefix, p);
+ decode(min_kdata, p);
+ decode(kdata, p);
+ decode(ts, p);
+ decode(to_create, p);
+ decode(to_delete, p);
+ decode(obj, p);
+ DECODE_FINISH(p);
+ }
+
+ /*
+ * Prints a string representation of the information, in the following format:
+ * (min_kdata/
+ * kdata,
+ * prefix
+ * ts
+ * elements of to_create, organized into (high key| obj name)
+ * ;
+ * elements of to_delete, organized into (high key| obj name | version number)
+ * :
+ * val)
+ */
+ string str() const {
+ stringstream strm;
+ strm << '(' << min_kdata.encoded() << "/" << kdata.encoded() << ','
+ << prefix;
+ if (prefix == "1") {
+ strm << ts.sec() << '.' << ts.usec();
+ for(vector<create_data>::const_iterator it = to_create.begin();
+ it != to_create.end(); ++it) {
+ strm << '(' << it->min.encoded() << '/' << it->max.encoded() << '|'
+ << it->obj << ')';
+ }
+ strm << ';';
+ for(vector<delete_data >::const_iterator it = to_delete.begin();
+ it != to_delete.end(); ++it) {
+ strm << '(' << it->min.encoded() << '/' << it->max.encoded() << '|'
+ << it->obj << '|'
+ << it->version << ')';
+ }
+ strm << ':';
+ }
+ strm << obj << ')';
+ return strm.str();
+ }
+};
+WRITE_CLASS_ENCODER(index_data)
+
+/**
+ * Structure to store information read from the index for reuse.
+ */
+class IndexCache {
+protected:
+ map<key_data, pair<index_data, utime_t> > k2itmap;
+ map<utime_t, key_data> t2kmap;
+ int cache_size;
+
+public:
+ IndexCache(int n)
+ : cache_size(n)
+ {}
+ /**
+ * Inserts idata into the cache and removes whatever key mapped to before.
+ * If the cache is full, pops the oldest entry.
+ */
+ void push(const string &key, const index_data &idata);
+
+ /**
+ * Inserts idata into the cache. If idata.kdata is already in the cache,
+ * replaces the old one. Pops the oldest entry if the cache is full.
+ */
+ void push(const index_data &idata);
+
+ /**
+ * Removes the oldest entry from the cache
+ */
+ void pop();
+
+ /**
+ * Removes the value associated with kdata from both maps
+ */
+ void erase(key_data kdata);
+
+ /**
+ * gets the idata where key belongs. If none, returns -ENODATA.
+ */
+ int get(const string &key, index_data *idata) const;
+
+ /**
+ * Gets the idata where key goes and the one after it. If there are not
+ * valid entries for both of them, returns -ENODATA.
+ */
+ int get(const string &key, index_data *idata, index_data * next_idata) const;
+ void clear();
+};
+
+class KvFlatBtreeAsync;
+
+
+/**
+ * These are used internally to translate aio operations into useful thread
+ * arguments.
+ */
+struct aio_set_args {
+ KvFlatBtreeAsync * kvba;
+ string key;
+ bufferlist val;
+ bool exc;
+ callback cb;
+ void * cb_args;
+ int * err;
+};
+
+struct aio_rm_args {
+ KvFlatBtreeAsync * kvba;
+ string key;
+ callback cb;
+ void * cb_args;
+ int * err;
+};
+
+struct aio_get_args {
+ KvFlatBtreeAsync * kvba;
+ string key;
+ bufferlist * val;
+ bool exc;
+ callback cb;
+ void * cb_args;
+ int * err;
+};
+
+class KvFlatBtreeAsync : public KeyValueStructure {
+protected:
+
+ //don't change these once operations start being called - they are not
+ //protected with mutexes!
+ int k;
+ string index_name;
+ librados::IoCtx io_ctx;
+ string rados_id;
+ string client_name;
+ librados::Rados rados;
+ string pool_name;
+ injection_t interrupt;
+ int wait_ms;
+ utime_t timeout; //declare a client dead if it goes this long without
+ //finishing a split/merge
+ int cache_size;
+ double cache_refresh; //read cache_size / cache_refresh entries each time the
+ //index is read
+ bool verbose;//if true, display lots of debug output
+
+ //shared variables protected with mutexes
+ Mutex client_index_lock;
+ int client_index; //names of new objects are client_name.client_index
+ Mutex icache_lock;
+ IndexCache icache;
+ friend struct index_data;
+
+ /**
+ * finds the object in the index with the lowest key value that is greater
+ * than idata.kdata. If idata.kdata is the max key, returns -EOVERFLOW. If
+ * idata has a prefix and has timed out, cleans up.
+ *
+ * @param idata: idata for the object to search for.
+ * @param out_data: the idata for the next object.
+ *
+ * @pre: idata must contain a key_data.
+ * @post: out_data contains complete information
+ */
+ int next(const index_data &idata, index_data * out_data);
+
+ /**
+ * finds the object in the index with the lowest key value that is greater
+ * than idata.kdata. If idata.kdata is the lowest key, returns -ERANGE If
+ * idata has a prefix and has timed out, cleans up.
+ *
+ * @param idata: idata for the object to search for.
+ * @param out_data: the idata for the next object.
+ *
+ * @pre: idata must contain a key_data.
+ * @post: out_data contains complete information
+ */
+ int prev(const index_data &idata, index_data * out_data);
+
+ /**
+ * finds the index_data where a key belongs, from cache if possible. If it
+ * reads the index object, it will read the first cache_size entries after
+ * key and put them in the cache.
+ *
+ * @param key: the key to search for
+ * @param idata: the index_data for the first index value such that idata.key
+ * is greater than key.
+ * @param next_idata: if not NULL, this will be set to the idata after idata
+ * @param force_update: if false, will try to read from cache first.
+ *
+ * @pre: key is not encoded
+ * @post: idata contains complete information
+ * stored
+ */
+ int read_index(const string &key, index_data * idata,
+ index_data * next_idata, bool force_update);
+
+ /**
+ * Reads obj and generates information about it. Iff the object has >= 2k
+ * entries, reads the whole omap and then splits it.
+ *
+ * @param idata: index data for the object being split
+ * @pre: idata contains a key and an obj
+ * @post: idata.obj has been split and icache has been updated
+ * @return -EBALANCE if obj does not need to be split, 0 if split successful,
+ * error from read_object or perform_ops if there is one.
+ */
+ int split(const index_data &idata);
+
+ /**
+ * reads o1 and the next object after o1 and, if necessary, rebalances them.
+ * if hk1 is the highest key in the index, calls rebalance on the next highest
+ * key.
+ *
+ * @param idata: index data for the object being rebalanced
+ * @param next_idata: index data for the next object. If blank, will read.
+ * @pre: idata contains a key and an obj
+ * @post: idata.obj has been rebalanced and icache has been updated
+ * @return -EBALANCE if no change needed, -ENOENT if o1 does not exist,
+ * -ECANCELED if second object does not exist, otherwise, error from
+ * perform_ops
+ */
+ int rebalance(const index_data &idata1, const index_data &next_idata);
+
+ /**
+ * performs an ObjectReadOperation to populate odata
+ *
+ * @post: odata has all information about obj except for key (which is "")
+ */
+ int read_object(const string &obj, object_data * odata);
+
+ /**
+ * performs a maybe_read_for_balance ObjectOperation so the omap is only
+ * read if the object is out of bounds.
+ */
+ int read_object(const string &obj, rebalance_args * args);
+
+ /**
+ * sets up owo to change the index in preparation for a split/merge.
+ *
+ * @param to_create: vector of object_data to be created.
+ * @param to_delete: vector of object_data to be deleted.
+ * @param owo: the ObjectWriteOperation to set up
+ * @param idata: will be populated by index data for this op.
+ * @param err: error code reference to pass to omap_cmp
+ * @pre: entries in to_create and to_delete must have keys and names.
+ */
+ void set_up_prefix_index(
+ const vector<object_data> &to_create,
+ const vector<object_data> &to_delete,
+ librados::ObjectWriteOperation * owo,
+ index_data * idata,
+ int * err);
+
+ /**
+ * sets up all make, mark, restore, and delete ops, as well as the remove
+ * prefix op, based on idata.
+ *
+ * @param create_vector: vector of data about the objects to be created.
+ * @pre: entries in create_data must have names and omaps and be in idata
+ * order
+ * @param delete_vector: vector of data about the objects to be deleted
+ * @pre: entries in to_delete must have versions and be in idata order
+ * @param ops: the owos to set up. the pair is a pair of op identifiers
+ * and names of objects - set_up_ops fills these in.
+ * @pre: ops must be the correct size and the ObjectWriteOperation pointers
+ * must be valid.
+ * @param idata: the idata with information about how to set up the ops
+ * @pre: idata has valid to_create and to_delete
+ * @param err: the int to get the error value for omap_cmp
+ */
+ void set_up_ops(
+ const vector<object_data> &create_vector,
+ const vector<object_data> &delete_vector,
+ vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > * ops,
+ const index_data &idata,
+ int * err);
+
+ /**
+ * sets up owo to exclusive create, set omap to to_set, and set
+ * unwritable to "0"
+ */
+ void set_up_make_object(
+ const map<std::string, bufferlist> &to_set,
+ librados::ObjectWriteOperation *owo);
+
+ /**
+ * sets up owo to assert object version and that object version is
+ * writable,
+ * then mark it unwritable.
+ *
+ * @param ver: if this is 0, no version is asserted.
+ */
+ void set_up_unwrite_object(
+ const int &ver, librados::ObjectWriteOperation *owo);
+
+ /**
+ * sets up owo to assert that an object is unwritable and then mark it
+ * writable
+ */
+ void set_up_restore_object(
+ librados::ObjectWriteOperation *owo);
+
+ /**
+ * sets up owo to assert that the object is unwritable and then remove it
+ */
+ void set_up_delete_object(
+ librados::ObjectWriteOperation *owo);
+
+ /**
+ * perform the operations in ops and handles errors.
+ *
+ * @param debug_prefix: what to print at the beginning of debug output
+ * @param idata: the idata for the object being operated on, to be
+ * passed to cleanup if necessary
+ * @param ops: this contains an int identifying the type of op,
+ * a string that is the name of the object to operate on, and a pointer
+ * to the ObjectWriteOperation to use. All of this must be complete.
+ * @post: all operations are performed and most errors are handled
+ * (e.g., cleans up if an assertion fails). If an unknown error is found,
+ * returns it.
+ */
+ int perform_ops( const string &debug_prefix,
+ const index_data &idata,
+ vector<pair<pair<int, string>, librados::ObjectWriteOperation*> > * ops);
+
+ /**
+ * Called when a client discovers that another client has died during a
+ * split or a merge. cleans up after that client.
+ *
+ * @param idata: the index data parsed from the index entry left by the dead
+ * client.
+ * @param error: the error that caused the client to realize the other client
+ * died (should be -ENOENT or -ETIMEDOUT)
+ * @post: rolls forward if -ENOENT, otherwise rolls back.
+ */
+ int cleanup(const index_data &idata, const int &error);
+
+ /**
+ * does the ObjectWriteOperation and splits, reads the index, and/or retries
+ * until success.
+ */
+ int set_op(const string &key, const bufferlist &val,
+ bool update_on_existing, index_data &idata);
+
+ /**
+ * does the ObjectWriteOperation and merges, reads the index, and/or retries
+ * until success.
+ */
+ int remove_op(const string &key, index_data &idata, index_data &next_idata);
+
+ /**
+ * does the ObjectWriteOperation and reads the index and/or retries
+ * until success.
+ */
+ int get_op(const string &key, bufferlist * val, index_data &idata);
+
+ /**
+ * does the ObjectWriteOperation and splits, reads the index, and/or retries
+ * until success.
+ */
+ int handle_set_rm_errors(int &err, string key, string obj,
+ index_data * idata, index_data * next_idata);
+
+ /**
+ * called by aio_set, aio_remove, and aio_get, respectively.
+ */
+ static void* pset(void *ptr);
+ static void* prm(void *ptr);
+ static void* pget(void *ptr);
+public:
+
+
+ //interruption methods, for correctness testing
+ /**
+ * returns 0
+ */
+ int nothing() override;
+ /**
+ * 10% chance of waiting wait_ms seconds
+ */
+ int wait() override;
+ /**
+ * 10% chance of killing the client.
+ */
+ int suicide() override;
+
+KvFlatBtreeAsync(int k_val, string name, int cache, double cache_r,
+ bool verb)
+ : k(k_val),
+ index_name("index_object"),
+ rados_id(name),
+ client_name(string(name).append(".")),
+ pool_name("rbd"),
+ interrupt(&KeyValueStructure::nothing),
+ wait_ms(0),
+ timeout(100000,0),
+ cache_size(cache),
+ cache_refresh(cache_r),
+ verbose(verb),
+ client_index_lock("client_index_lock"),
+ client_index(0),
+ icache_lock("icache_lock"),
+ icache(cache)
+ {}
+
+ /**
+ * creates a string with an int at the end.
+ *
+ * @param s: the string on the left
+ * @param i: the int to be appended to the string
+ * @return the string
+ */
+ static string to_string(string s, int i);
+
+ /**
+ * returns in encoded
+ */
+ static bufferlist to_bl(const string &in) {
+ bufferlist bl;
+ bl.append(in);
+ return bl;
+ }
+
+ /**
+ * returns idata encoded;
+ */
+ static bufferlist to_bl(const index_data &idata) {
+ bufferlist bl;
+ idata.encode(bl);
+ return bl;
+ }
+
+ /**
+ * returns the rados_id of this KvFlatBtreeAsync
+ */
+ string get_name();
+
+ /**
+ * sets this kvba to call inject before every ObjectWriteOperation.
+ * If inject is wait and wait_time is set, wait will have a 10% chance of
+ * sleeping for waite_time milliseconds.
+ */
+ void set_inject(injection_t inject, int wait_time) override;
+
+ /**
+ * sets up the rados and io_ctx of this KvFlatBtreeAsync. If the don't already
+ * exist, creates the index and max object.
+ */
+ int setup(int argc, const char** argv) override;
+
+ int set(const string &key, const bufferlist &val,
+ bool update_on_existing) override;
+
+ int remove(const string &key) override;
+
+ /**
+ * returns true if all of the following are true:
+ *
+ * all objects are accounted for in the index or a prefix
+ * (i.e., no floating objects)
+ * all objects have k <= size <= 2k
+ * all keys in an object are within the specified predicted by the index
+ *
+ * if any of those fails, states that the problem(s) are, and prints str().
+ *
+ * @pre: no operations are in progress
+ */
+ bool is_consistent() override;
+
+ /**
+ * returns an ASCII representation of the index and sub objects, showing
+ * stats about each object and all omaps. Don't use if you have more than
+ * about 10 objects.
+ */
+ string str() override;
+
+ int get(const string &key, bufferlist *val) override;
+
+ //async versions of these methods
+ void aio_get(const string &key, bufferlist *val, callback cb,
+ void *cb_args, int * err) override;
+ void aio_set(const string &key, const bufferlist &val, bool exclusive,
+ callback cb, void * cb_args, int * err) override;
+ void aio_remove(const string &key, callback cb, void *cb_args, int * err) override;
+
+ //these methods that deal with multiple keys at once are efficient, but make
+ //no guarantees about atomicity!
+
+ /**
+ * Removes all objects and resets the store as if setup had just run. Makes no
+ * attempt to do this safely - make sure this is the only operation running
+ * when it is called!
+ */
+ int remove_all() override;
+
+ /**
+ * This does not add prefixes to the index and therefore DOES NOT guarantee
+ * consistency! It is ONLY safe if there is only one instance at a time.
+ * It follows the same general logic as a rebalance, but
+ * with all objects that contain any of the keys in in_map. It is O(n), where
+ * n is the number of librados objects it has to change. Higher object sizes
+ * (i.e., k values) also decrease the efficiency of this method because it
+ * copies all of the entries in each object it modifies. Writing new objects
+ * is done in parallel.
+ *
+ * This is efficient if:
+ * * other clients are very unlikely to be modifying any of the objects while
+ * this operation is in progress
+ * * The entries in in_map are close together
+ * * It is especially efficient for initially entering lots of entries into
+ * an empty structure.
+ *
+ * It is very inefficient compared to setting one key and/or will starve if:
+ * * other clients are modifying the objects it tries to modify
+ * * The keys are distributed across the range of keys in the store
+ * * there is a small number of keys compared to k
+ */
+ int set_many(const map<string, bufferlist> &in_map) override;
+
+ int get_all_keys(std::set<string> *keys) override;
+ int get_all_keys_and_values(map<string,bufferlist> *kv_map) override;
+
+};
+
+#endif /* KVFLATBTREEASYNC_H_ */
diff --git a/src/key_value_store/kvs_arg_types.h b/src/key_value_store/kvs_arg_types.h
new file mode 100644
index 00000000..9798aec7
--- /dev/null
+++ b/src/key_value_store/kvs_arg_types.h
@@ -0,0 +1,144 @@
+/*
+ * Argument types used by cls_kvs.cc
+ *
+ * Created on: Aug 10, 2012
+ * Author: eleanor
+ */
+
+#ifndef CLS_KVS_H_
+#define CLS_KVS_H_
+
+#define EBALANCE 137
+
+#include "include/encoding.h"
+#include "key_value_store/kv_flat_btree_async.h"
+
+using ceph::bufferlist;
+
+struct assert_size_args {
+ uint64_t bound; //the size to compare to - should be k or 2k
+ uint64_t comparator; //should be CEPH_OSD_CMPXATTR_OP_EQ,
+ //CEPH_OSD_CMPXATTR_OP_LT, or
+ //CEPH_OSD_CMPXATTR_OP_GT
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(bound, bl);
+ encode(comparator, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(bound, p);
+ decode(comparator, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(assert_size_args)
+
+struct idata_from_key_args {
+ string key;
+ index_data idata;
+ index_data next_idata;
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(key, bl);
+ encode(idata, bl);
+ encode(next_idata, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(key, p);
+ decode(idata, p);
+ decode(next_idata, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(idata_from_key_args)
+
+struct idata_from_idata_args {
+ index_data idata;
+ index_data next_idata;
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(idata, bl);
+ encode(next_idata, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(idata, p);
+ decode(next_idata, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(idata_from_idata_args)
+
+struct omap_set_args {
+ map<string, bufferlist> omap;
+ uint64_t bound;
+ bool exclusive;
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(omap, bl);
+ encode(bound, bl);
+ encode(exclusive, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(omap, p);
+ decode(bound, p);
+ decode(exclusive, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(omap_set_args)
+
+struct omap_rm_args {
+ std::set<string> omap;
+ uint64_t bound;
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(omap, bl);
+ encode(bound, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(omap, p);
+ decode(bound, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(omap_rm_args)
+
+struct rebalance_args {
+ object_data odata;
+ uint64_t bound;
+ uint64_t comparator;
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1,1,bl);
+ encode(odata, bl);
+ encode(bound, bl);
+ encode(comparator, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ DECODE_START(1, p);
+ decode(odata,p);
+ decode(bound, p);
+ decode(comparator, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(rebalance_args)
+
+
+#endif /* CLS_KVS_H_ */