summaryrefslogtreecommitdiffstats
path: root/src/rgw/rgw_putobj_processor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/rgw/rgw_putobj_processor.cc')
-rw-r--r--src/rgw/rgw_putobj_processor.cc670
1 files changed, 670 insertions, 0 deletions
diff --git a/src/rgw/rgw_putobj_processor.cc b/src/rgw/rgw_putobj_processor.cc
new file mode 100644
index 00000000..3de30a82
--- /dev/null
+++ b/src/rgw/rgw_putobj_processor.cc
@@ -0,0 +1,670 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_aio.h"
+#include "rgw_putobj_processor.h"
+#include "rgw_multi.h"
+#include "services/svc_sys_obj.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::putobj {
+
+int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset)
+{
+ const bool flush = (data.length() == 0);
+
+ // capture the first chunk for special handling
+ if (data_offset < head_chunk_size || data_offset == 0) {
+ if (flush) {
+ // flush partial chunk
+ return process_first_chunk(std::move(head_data), &processor);
+ }
+
+ auto remaining = head_chunk_size - data_offset;
+ auto count = std::min<uint64_t>(data.length(), remaining);
+ data.splice(0, count, &head_data);
+ data_offset += count;
+
+ if (data_offset == head_chunk_size) {
+ // process the first complete chunk
+ ceph_assert(head_data.length() == head_chunk_size);
+ int r = process_first_chunk(std::move(head_data), &processor);
+ if (r < 0) {
+ return r;
+ }
+ }
+ if (data.length() == 0) { // avoid flushing stripe processor
+ return 0;
+ }
+ }
+ ceph_assert(processor); // process_first_chunk() must initialize
+
+ // send everything else through the processor
+ auto write_offset = data_offset;
+ data_offset += data.length();
+ return processor->process(std::move(data), write_offset);
+}
+
+
+static int process_completed(const AioResultList& completed, RawObjSet *written)
+{
+ std::optional<int> error;
+ for (auto& r : completed) {
+ if (r.result >= 0) {
+ written->insert(r.obj.get_ref().obj);
+ } else if (!error) { // record first error code
+ error = r.result;
+ }
+ }
+ return error.value_or(0);
+}
+
+int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj)
+{
+ stripe_obj = store->svc.rados->obj(raw_obj);
+ return stripe_obj.open();
+}
+
+int RadosWriter::process(bufferlist&& bl, uint64_t offset)
+{
+ bufferlist data = std::move(bl);
+ const uint64_t cost = data.length();
+ if (cost == 0) { // no empty writes, use aio directly for creates
+ return 0;
+ }
+ librados::ObjectWriteOperation op;
+ if (offset == 0) {
+ op.write_full(data);
+ } else {
+ op.write(offset, data);
+ }
+ constexpr uint64_t id = 0; // unused
+ auto c = aio->submit(stripe_obj, &op, cost, id);
+ return process_completed(c, &written);
+}
+
+int RadosWriter::write_exclusive(const bufferlist& data)
+{
+ const uint64_t cost = data.length();
+
+ librados::ObjectWriteOperation op;
+ op.create(true); // exclusive create
+ op.write_full(data);
+
+ constexpr uint64_t id = 0; // unused
+ auto c = aio->submit(stripe_obj, &op, cost, id);
+ auto d = aio->drain();
+ c.splice(c.end(), d);
+ return process_completed(c, &written);
+}
+
+int RadosWriter::drain()
+{
+ return process_completed(aio->drain(), &written);
+}
+
+RadosWriter::~RadosWriter()
+{
+ // wait on any outstanding aio completions
+ process_completed(aio->drain(), &written);
+
+ bool need_to_remove_head = false;
+ std::optional<rgw_raw_obj> raw_head;
+ if (!head_obj.empty()) {
+ raw_head.emplace();
+ store->obj_to_raw(bucket_info.placement_rule, head_obj, &*raw_head);
+ }
+
+ /**
+ * We should delete the object in the "multipart" namespace to avoid race condition.
+ * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
+ * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
+ * written by the second upload may be deleted by the first upload.
+ * details is describled on #11749
+ *
+ * The above comment still stands, but instead of searching for a specific object in the multipart
+ * namespace, we just make sure that we remove the object that is marked as the head object after
+ * we remove all the other raw objects. Note that we use different call to remove the head object,
+ * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
+ */
+ for (const auto& obj : written) {
+ if (raw_head && obj == *raw_head) {
+ ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
+ need_to_remove_head = true;
+ continue;
+ }
+
+ int r = store->delete_raw_obj(obj);
+ if (r < 0 && r != -ENOENT) {
+ ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
+ }
+ }
+
+ if (need_to_remove_head) {
+ ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl;
+ int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
+ if (r < 0 && r != -ENOENT) {
+ ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl;
+ }
+ }
+}
+
+
+// advance to the next stripe
+int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size)
+{
+ // advance the manifest
+ int r = manifest_gen.create_next(offset);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ uint64_t chunk_size = 0;
+ r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size);
+ if (r < 0) {
+ return r;
+ }
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ chunk = ChunkProcessor(&writer, chunk_size);
+ *pstripe_size = manifest_gen.cur_stripe_max_size();
+ return 0;
+}
+
+
+int AtomicObjectProcessor::process_first_chunk(bufferlist&& data,
+ DataProcessor **processor)
+{
+ first_chunk = std::move(data);
+ *processor = &stripe;
+ return 0;
+}
+
+int AtomicObjectProcessor::prepare()
+{
+ uint64_t max_head_chunk_size;
+ uint64_t head_max_size;
+ uint64_t chunk_size = 0;
+ uint64_t alignment;
+ rgw_pool head_pool;
+
+ if (!store->get_obj_data_pool(bucket_info.placement_rule, head_obj, &head_pool)) {
+ return -EIO;
+ }
+
+ int r = store->get_max_chunk_size(head_pool, &max_head_chunk_size, &alignment);
+ if (r < 0) {
+ return r;
+ }
+
+ bool same_pool = true;
+
+ if (bucket_info.placement_rule != tail_placement_rule) {
+ rgw_pool tail_pool;
+ if (!store->get_obj_data_pool(tail_placement_rule, head_obj, &tail_pool)) {
+ return -EIO;
+ }
+
+ if (tail_pool != head_pool) {
+ same_pool = false;
+
+ r = store->get_max_chunk_size(tail_pool, &chunk_size);
+ if (r < 0) {
+ return r;
+ }
+
+ head_max_size = 0;
+ }
+ }
+
+ if (same_pool) {
+ head_max_size = max_head_chunk_size;
+ chunk_size = max_head_chunk_size;
+ }
+
+ uint64_t stripe_size;
+ const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
+
+ store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size);
+
+ manifest.set_trivial_rule(head_max_size, stripe_size);
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest,
+ bucket_info.placement_rule,
+ &tail_placement_rule,
+ head_obj.bucket, head_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ set_head_chunk_size(head_max_size);
+ // initialize the processors
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, head_max_size);
+ return 0;
+}
+
+int AtomicObjectProcessor::complete(size_t accounted_size,
+ const std::string& etag,
+ ceph::real_time *mtime,
+ ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match,
+ const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace,
+ bool *pcanceled)
+{
+ int r = writer.drain();
+ if (r < 0) {
+ return r;
+ }
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+
+ obj_ctx.set_atomic(head_obj);
+
+ RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
+
+ /* some object types shouldn't be versioned, e.g., multipart parts */
+ op_target.set_versioning_disabled(!bucket_info.versioning_enabled());
+
+ RGWRados::Object::Write obj_op(&op_target);
+
+ obj_op.meta.data = &first_chunk;
+ obj_op.meta.manifest = &manifest;
+ obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
+ obj_op.meta.if_match = if_match;
+ obj_op.meta.if_nomatch = if_nomatch;
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.olh_epoch = olh_epoch;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.user_data = user_data;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+
+ r = obj_op.write_meta(actual_size, accounted_size, attrs);
+ if (r < 0) {
+ return r;
+ }
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ return 0;
+}
+
+
+int MultipartObjectProcessor::process_first_chunk(bufferlist&& data,
+ DataProcessor **processor)
+{
+ // write the first chunk of the head object as part of an exclusive create,
+ // then drain to wait for the result in case of EEXIST
+ int r = writer.write_exclusive(data);
+ if (r == -EEXIST) {
+ // randomize the oid prefix and reprepare the head/manifest
+ std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32);
+
+ mp.init(target_obj.key.name, upload_id, oid_rand);
+ manifest.set_prefix(target_obj.key.name + "." + oid_rand);
+
+ r = prepare_head();
+ if (r < 0) {
+ return r;
+ }
+ // resubmit the write op on the new head object
+ r = writer.write_exclusive(data);
+ }
+ if (r < 0) {
+ return r;
+ }
+ *processor = &stripe;
+ return 0;
+}
+
+int MultipartObjectProcessor::prepare_head()
+{
+ const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
+ uint64_t chunk_size;
+ uint64_t stripe_size;
+ uint64_t alignment;
+
+ int r = store->get_max_chunk_size(tail_placement_rule, target_obj, &chunk_size, &alignment);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl;
+ return r;
+ }
+ store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size);
+
+ manifest.set_multipart_part_rule(stripe_size, part_num);
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest,
+ bucket_info.placement_rule,
+ &tail_placement_rule,
+ target_obj.bucket, target_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+ rgw_raw_obj_to_obj(head_obj.bucket, stripe_obj, &head_obj);
+ head_obj.index_hash_source = target_obj.key.name;
+
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+ stripe_size = manifest_gen.cur_stripe_max_size();
+ set_head_chunk_size(stripe_size);
+
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, stripe_size);
+ return 0;
+}
+
+int MultipartObjectProcessor::prepare()
+{
+ manifest.set_prefix(target_obj.key.name + "." + upload_id);
+
+ return prepare_head();
+}
+
+int MultipartObjectProcessor::complete(size_t accounted_size,
+ const std::string& etag,
+ ceph::real_time *mtime,
+ ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match,
+ const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace,
+ bool *pcanceled)
+{
+ int r = writer.drain();
+ if (r < 0) {
+ return r;
+ }
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+
+ RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
+ op_target.set_versioning_disabled(true);
+ RGWRados::Object::Write obj_op(&op_target);
+
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+
+ r = obj_op.write_meta(actual_size, accounted_size, attrs);
+ if (r < 0)
+ return r;
+
+ bufferlist bl;
+ RGWUploadPartInfo info;
+ string p = "part.";
+ bool sorted_omap = is_v2_upload_id(upload_id);
+
+ if (sorted_omap) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%08d", part_num);
+ p.append(buf);
+ } else {
+ p.append(part_num_str);
+ }
+ info.num = part_num;
+ info.etag = etag;
+ info.size = actual_size;
+ info.accounted_size = accounted_size;
+ info.modified = real_clock::now();
+ info.manifest = manifest;
+
+ bool compressed;
+ r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info);
+ if (r < 0) {
+ ldout(store->ctx(), 1) << "cannot get compression info" << dendl;
+ return r;
+ }
+
+ encode(info, bl);
+
+ rgw_obj meta_obj;
+ meta_obj.init_ns(bucket_info.bucket, mp.get_meta(), RGW_OBJ_NS_MULTIPART);
+ meta_obj.set_in_extra_data(true);
+
+ rgw_raw_obj raw_meta_obj;
+
+ store->obj_to_raw(bucket_info.placement_rule, meta_obj, &raw_meta_obj);
+
+ auto obj_ctx = store->svc.sysobj->init_obj_ctx();
+ auto sysobj = obj_ctx.get_obj(raw_meta_obj);
+
+ r = sysobj.omap()
+ .set_must_exist(true)
+ .set(p, bl);
+ if (r < 0) {
+ return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r;
+ }
+
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ return 0;
+}
+
+int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::putobj::DataProcessor **processor)
+{
+ int r = writer.write_exclusive(data);
+ if (r < 0) {
+ return r;
+ }
+ *processor = &stripe;
+ return 0;
+}
+
+int AppendObjectProcessor::prepare()
+{
+ RGWObjState *astate;
+ int r = store->get_obj_state(&obj_ctx, bucket_info, head_obj, &astate);
+ if (r < 0) {
+ return r;
+ }
+ cur_size = astate->size;
+ *cur_accounted_size = astate->accounted_size;
+ if (!astate->exists) {
+ if (position != 0) {
+ ldout(store->ctx(), 5) << "ERROR: Append position should be zero" << dendl;
+ return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+ } else {
+ cur_part_num = 1;
+ //set the prefix
+ char buf[33];
+ gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+ string oid_prefix = head_obj.key.name;
+ oid_prefix.append(".");
+ oid_prefix.append(buf);
+ oid_prefix.append("_");
+ manifest.set_prefix(oid_prefix);
+ }
+ } else {
+ // check whether the object appendable
+ map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
+ if (iter == astate->attrset.end()) {
+ ldout(store->ctx(), 5) << "ERROR: The object is not appendable" << dendl;
+ return -ERR_OBJECT_NOT_APPENDABLE;
+ }
+ if (position != *cur_accounted_size) {
+ ldout(store->ctx(), 5) << "ERROR: Append position should be equal to the obj size" << dendl;
+ return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+ }
+ try {
+ decode(cur_part_num, iter->second);
+ } catch (buffer::error& err) {
+ ldout(store->ctx(), 5) << "ERROR: failed to decode part num" << dendl;
+ return -EIO;
+ }
+ cur_part_num++;
+ //get the current obj etag
+ iter = astate->attrset.find(RGW_ATTR_ETAG);
+ if (iter != astate->attrset.end()) {
+ string s = rgw_string_unquote(iter->second.c_str());
+ size_t pos = s.find("-");
+ cur_etag = s.substr(0, pos);
+ }
+
+ iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != astate->attrset.end()) {
+ tail_placement_rule.storage_class = iter->second.to_str();
+ }
+ cur_manifest = &astate->manifest;
+ manifest.set_prefix(cur_manifest->get_prefix());
+ astate->keep_tail = true;
+ }
+ manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num);
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, &tail_placement_rule, head_obj.bucket, head_obj);
+ if (r < 0) {
+ return r;
+ }
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ uint64_t chunk_size = 0;
+ r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size);
+ if (r < 0) {
+ return r;
+ }
+ r = writer.set_stripe_obj(std::move(stripe_obj));
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t stripe_size = manifest_gen.cur_stripe_max_size();
+
+ uint64_t max_head_size = std::min(chunk_size, stripe_size);
+ set_head_chunk_size(max_head_size);
+
+ // initialize the processors
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, stripe_size);
+
+ return 0;
+}
+
+int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime,
+ ceph::real_time set_mtime, map <string, bufferlist> &attrs,
+ ceph::real_time delete_at, const char *if_match, const char *if_nomatch,
+ const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled)
+{
+ int r = writer.drain();
+ if (r < 0)
+ return r;
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+ obj_ctx.set_atomic(head_obj);
+ RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
+ //For Append obj, disable versioning
+ op_target.set_versioning_disabled(true);
+ RGWRados::Object::Write obj_op(&op_target);
+ if (cur_manifest) {
+ cur_manifest->append(manifest, store->svc.zone);
+ obj_op.meta.manifest = cur_manifest;
+ } else {
+ obj_op.meta.manifest = &manifest;
+ }
+ obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.user_data = user_data;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+ obj_op.meta.appendable = true;
+ //Add the append part number
+ bufferlist cur_part_num_bl;
+ encode(cur_part_num, cur_part_num_bl);
+ attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl;
+ //calculate the etag
+ if (!cur_etag.empty()) {
+ MD5 hash;
+ char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+ hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+ hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+ hash.Final((unsigned char *)final_etag);
+ buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+ snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+ "-%lld", (long long)cur_part_num);
+ bufferlist etag_bl;
+ etag_bl.append(final_etag_str, strlen(final_etag_str) + 1);
+ attrs[RGW_ATTR_ETAG] = etag_bl;
+ }
+ r = obj_op.write_meta(actual_size + cur_size, accounted_size + *cur_accounted_size, attrs);
+ if (r < 0) {
+ return r;
+ }
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ *cur_accounted_size += accounted_size;
+
+ return 0;
+}
+
+} // namespace rgw::putobj