summaryrefslogtreecommitdiffstats
path: root/src/os/filestore/FileJournal.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/os/filestore/FileJournal.h')
-rw-r--r--src/os/filestore/FileJournal.h556
1 files changed, 556 insertions, 0 deletions
diff --git a/src/os/filestore/FileJournal.h b/src/os/filestore/FileJournal.h
new file mode 100644
index 00000000..2313b4b8
--- /dev/null
+++ b/src/os/filestore/FileJournal.h
@@ -0,0 +1,556 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILEJOURNAL_H
+#define CEPH_FILEJOURNAL_H
+
+#include <stdlib.h>
+#include <deque>
+using std::deque;
+
+#include "Journal.h"
+#include "common/config_fwd.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+#include "JournalThrottle.h"
+#include "common/zipkin_trace.h"
+
+#ifdef HAVE_LIBAIO
+# include <libaio.h>
+#endif
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+/**
+ * Implements journaling on top of block device or file.
+ *
+ * Lock ordering is write_lock > aio_lock > (completions_lock | finisher_lock)
+ */
+class FileJournal :
+ public Journal,
+ public md_config_obs_t {
+public:
+ /// Protected by finisher_lock
+ struct completion_item {
+ uint64_t seq;
+ Context *finish;
+ utime_t start;
+ TrackedOpRef tracked_op;
+ completion_item(uint64_t o, Context *c, utime_t s, TrackedOpRef opref)
+ : seq(o), finish(c), start(s), tracked_op(opref) {}
+ completion_item() : seq(0), finish(0), start(0) {}
+ };
+ struct write_item {
+ uint64_t seq;
+ bufferlist bl;
+ uint32_t orig_len;
+ TrackedOpRef tracked_op;
+ ZTracer::Trace trace;
+ write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) :
+ seq(s), orig_len(ol), tracked_op(opref) {
+ bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+ }
+ write_item() : seq(0), orig_len(0) {}
+ };
+
+ Mutex finisher_lock;
+ Cond finisher_cond;
+ uint64_t journaled_seq;
+ bool plug_journal_completions;
+
+ Mutex writeq_lock;
+ Cond writeq_cond;
+ list<write_item> writeq;
+ bool writeq_empty();
+ write_item &peek_write();
+ void pop_write();
+ void batch_pop_write(list<write_item> &items);
+ void batch_unpop_write(list<write_item> &items);
+
+ Mutex completions_lock;
+ list<completion_item> completions;
+ bool completions_empty() {
+ Mutex::Locker l(completions_lock);
+ return completions.empty();
+ }
+ void batch_pop_completions(list<completion_item> &items) {
+ Mutex::Locker l(completions_lock);
+ completions.swap(items);
+ }
+ void batch_unpop_completions(list<completion_item> &items) {
+ Mutex::Locker l(completions_lock);
+ completions.splice(completions.begin(), items);
+ }
+ completion_item completion_peek_front() {
+ Mutex::Locker l(completions_lock);
+ ceph_assert(!completions.empty());
+ return completions.front();
+ }
+ void completion_pop_front() {
+ Mutex::Locker l(completions_lock);
+ ceph_assert(!completions.empty());
+ completions.pop_front();
+ }
+
+ int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) override;
+
+ void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len,
+ Context *oncommit,
+ TrackedOpRef osd_op = TrackedOpRef()) override;
+ /// End protected by finisher_lock
+
+ /*
+ * journal header
+ */
+ struct header_t {
+ enum {
+ FLAG_CRC = (1<<0),
+ // NOTE: remove kludgey weirdness in read_header() next time a flag is added.
+ };
+
+ uint64_t flags;
+ uuid_d fsid;
+ __u32 block_size;
+ __u32 alignment;
+ int64_t max_size; // max size of journal ring buffer
+ int64_t start; // offset of first entry
+ uint64_t committed_up_to; // committed up to
+
+ /**
+ * start_seq
+ *
+ * entry at header.start has sequence >= start_seq
+ *
+ * Generally, the entry at header.start will have sequence
+ * start_seq if it exists. The only exception is immediately
+ * after journal creation since the first sequence number is
+ * not known.
+ *
+ * If the first read on open fails, we can assume corruption
+ * if start_seq > committed_up_to because the entry would have
+ * a sequence >= start_seq and therefore > committed_up_to.
+ */
+ uint64_t start_seq;
+
+ header_t() :
+ flags(0), block_size(0), alignment(0), max_size(0), start(0),
+ committed_up_to(0), start_seq(0) {}
+
+ void clear() {
+ start = block_size;
+ }
+
+ uint64_t get_fsid64() const {
+ return *(uint64_t*)fsid.bytes();
+ }
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ __u32 v = 4;
+ encode(v, bl);
+ bufferlist em;
+ {
+ encode(flags, em);
+ encode(fsid, em);
+ encode(block_size, em);
+ encode(alignment, em);
+ encode(max_size, em);
+ encode(start, em);
+ encode(committed_up_to, em);
+ encode(start_seq, em);
+ }
+ encode(em, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ __u32 v;
+ decode(v, bl);
+ if (v < 2) { // normally 0, but conceivably 1
+ // decode old header_t struct (pre v0.40).
+ bl.advance(4u); // skip __u32 flags (it was unused by any old code)
+ flags = 0;
+ uint64_t tfsid;
+ decode(tfsid, bl);
+ *(uint64_t*)&fsid.bytes()[0] = tfsid;
+ *(uint64_t*)&fsid.bytes()[8] = tfsid;
+ decode(block_size, bl);
+ decode(alignment, bl);
+ decode(max_size, bl);
+ decode(start, bl);
+ committed_up_to = 0;
+ start_seq = 0;
+ return;
+ }
+ bufferlist em;
+ decode(em, bl);
+ auto t = em.cbegin();
+ decode(flags, t);
+ decode(fsid, t);
+ decode(block_size, t);
+ decode(alignment, t);
+ decode(max_size, t);
+ decode(start, t);
+
+ if (v > 2)
+ decode(committed_up_to, t);
+ else
+ committed_up_to = 0;
+
+ if (v > 3)
+ decode(start_seq, t);
+ else
+ start_seq = 0;
+ }
+ } header;
+
+ struct entry_header_t {
+ uint64_t seq; // fs op seq #
+ uint32_t crc32c; // payload only. not header, pre_pad, post_pad, or footer.
+ uint32_t len;
+ uint32_t pre_pad, post_pad;
+ uint64_t magic1;
+ uint64_t magic2;
+
+ static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) {
+ return (fsid ^ seq ^ len);
+ }
+ bool check_magic(off64_t pos, uint64_t fsid) {
+ return
+ magic1 == (uint64_t)pos &&
+ magic2 == (fsid ^ seq ^ len);
+ }
+ } __attribute__((__packed__, aligned(4)));
+
+ bool journalq_empty() { return journalq.empty(); }
+
+private:
+ string fn;
+
+ char *zero_buf;
+ off64_t max_size;
+ size_t block_size;
+ bool directio, aio, force_aio;
+ bool must_write_header;
+ off64_t write_pos; // byte where the next entry to be written will go
+ off64_t read_pos; //
+ bool discard; //for block journal whether support discard
+
+#ifdef HAVE_LIBAIO
+ /// state associated with an in-flight aio request
+ /// Protected by aio_lock
+ struct aio_info {
+ struct iocb iocb {};
+ bufferlist bl;
+ struct iovec *iov;
+ bool done;
+ uint64_t off, len; ///< these are for debug only
+ uint64_t seq; ///< seq number to complete on aio completion, if non-zero
+
+ aio_info(bufferlist& b, uint64_t o, uint64_t s)
+ : iov(NULL), done(false), off(o), len(b.length()), seq(s) {
+ bl.claim(b);
+ }
+ ~aio_info() {
+ delete[] iov;
+ }
+ };
+ Mutex aio_lock;
+ Cond aio_cond;
+ Cond write_finish_cond;
+ io_context_t aio_ctx;
+ list<aio_info> aio_queue;
+ int aio_num, aio_bytes;
+ uint64_t aio_write_queue_ops;
+ uint64_t aio_write_queue_bytes;
+ /// End protected by aio_lock
+#endif
+
+ uint64_t last_committed_seq;
+ uint64_t journaled_since_start;
+
+ string devname;
+
+ /*
+ * full states cycle at the beginnging of each commit epoch, when commit_start()
+ * is called.
+ * FULL - we just filled up during this epoch.
+ * WAIT - we filled up last epoch; now we have to wait until everything during
+ * that epoch commits to the fs before we can start writing over it.
+ * NOTFULL - all good, journal away.
+ */
+ enum {
+ FULL_NOTFULL = 0,
+ FULL_FULL = 1,
+ FULL_WAIT = 2,
+ } full_state;
+
+ int fd;
+
+ // in journal
+ deque<pair<uint64_t, off64_t> > journalq; // track seq offsets, so we can trim later.
+ uint64_t writing_seq;
+
+
+ // throttle
+ int set_throttle_params();
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set <std::string> &changed) override {
+ for (const char **i = get_tracked_conf_keys();
+ *i;
+ ++i) {
+ if (changed.count(string(*i))) {
+ set_throttle_params();
+ return;
+ }
+ }
+ }
+
+ void complete_write(uint64_t ops, uint64_t bytes);
+ JournalThrottle throttle;
+
+ // write thread
+ Mutex write_lock;
+ bool write_stop;
+ bool aio_stop;
+
+ Cond commit_cond;
+
+ int _open(bool wr, bool create=false);
+ int _open_block_device();
+ void _close(int fd) const;
+ int _open_file(int64_t oldsize, blksize_t blksize, bool create);
+ int _dump(ostream& out, bool simple);
+ void print_header(const header_t &hdr) const;
+ int read_header(header_t *hdr) const;
+ bufferptr prepare_header();
+ void start_writer();
+ void stop_writer();
+ void write_thread_entry();
+
+ void queue_completions_thru(uint64_t seq);
+
+ int check_for_full(uint64_t seq, off64_t pos, off64_t size);
+ int prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytee);
+ int prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos,
+ uint64_t& orig_ops, uint64_t& orig_bytes);
+ void do_write(bufferlist& bl);
+
+ void write_finish_thread_entry();
+ void check_aio_completion();
+ void do_aio_write(bufferlist& bl);
+ int write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq);
+
+
+ void check_align(off64_t pos, bufferlist& bl);
+ int write_bl(off64_t& pos, bufferlist& bl);
+
+ /// read len from journal starting at in_pos and wrapping up to len
+ void wrap_read_bl(
+ off64_t in_pos, ///< [in] start position
+ int64_t len, ///< [in] length to read
+ bufferlist* bl, ///< [out] result
+ off64_t *out_pos ///< [out] next position to read, will be wrapped
+ ) const;
+
+ void do_discard(int64_t offset, int64_t end);
+
+ class Writer : public Thread {
+ FileJournal *journal;
+ public:
+ explicit Writer(FileJournal *fj) : journal(fj) {}
+ void *entry() override {
+ journal->write_thread_entry();
+ return 0;
+ }
+ } write_thread;
+
+ class WriteFinisher : public Thread {
+ FileJournal *journal;
+ public:
+ explicit WriteFinisher(FileJournal *fj) : journal(fj) {}
+ void *entry() override {
+ journal->write_finish_thread_entry();
+ return 0;
+ }
+ } write_finish_thread;
+
+ off64_t get_top() const {
+ return round_up_to(sizeof(header), block_size);
+ }
+
+ ZTracer::Endpoint trace_endpoint;
+
+ public:
+ FileJournal(CephContext* cct, uuid_d fsid, Finisher *fin, Cond *sync_cond,
+ const char *f, bool dio=false, bool ai=true, bool faio=false) :
+ Journal(cct, fsid, fin, sync_cond),
+ finisher_lock("FileJournal::finisher_lock", false, true, false),
+ journaled_seq(0),
+ plug_journal_completions(false),
+ writeq_lock("FileJournal::writeq_lock", false, true, false),
+ completions_lock(
+ "FileJournal::completions_lock", false, true, false),
+ fn(f),
+ zero_buf(NULL),
+ max_size(0), block_size(0),
+ directio(dio), aio(ai), force_aio(faio),
+ must_write_header(false),
+ write_pos(0), read_pos(0),
+ discard(false),
+#ifdef HAVE_LIBAIO
+ aio_lock("FileJournal::aio_lock"),
+ aio_ctx(0),
+ aio_num(0), aio_bytes(0),
+ aio_write_queue_ops(0),
+ aio_write_queue_bytes(0),
+#endif
+ last_committed_seq(0),
+ journaled_since_start(0),
+ full_state(FULL_NOTFULL),
+ fd(-1),
+ writing_seq(0),
+ throttle(cct->_conf->filestore_caller_concurrency),
+ write_lock("FileJournal::write_lock", false, true, false),
+ write_stop(true),
+ aio_stop(true),
+ write_thread(this),
+ write_finish_thread(this),
+ trace_endpoint("0.0.0.0", 0, "FileJournal") {
+
+ if (aio && !directio) {
+ lderr(cct) << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl;
+ aio = false;
+ }
+#ifndef HAVE_LIBAIO
+ if (aio && ::getenv("CEPH_DEV") == NULL) {
+ lderr(cct) << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl;
+ aio = false;
+ }
+#endif
+
+ cct->_conf.add_observer(this);
+ }
+ ~FileJournal() override {
+ ceph_assert(fd == -1);
+ delete[] zero_buf;
+ cct->_conf.remove_observer(this);
+ }
+
+ int check() override;
+ int create() override;
+ int open(uint64_t fs_op_seq) override;
+ void close() override;
+ int peek_fsid(uuid_d& fsid);
+
+ int dump(ostream& out) override;
+ int simple_dump(ostream& out);
+ int _fdump(Formatter &f, bool simple);
+
+ void flush() override;
+
+ void get_devices(set<string> *ls) override;
+ void collect_metadata(map<string,string> *pm) override;
+
+ void reserve_throttle_and_backoff(uint64_t count) override;
+
+ bool is_writeable() override {
+ return read_pos == 0;
+ }
+ int make_writeable() override;
+
+ // writes
+ void commit_start(uint64_t seq) override;
+ void committed_thru(uint64_t seq) override;
+ bool should_commit_now() override {
+ return full_state != FULL_NOTFULL && !write_stop;
+ }
+
+ void write_header_sync();
+
+ void set_wait_on_full(bool b) { wait_on_full = b; }
+
+ off64_t get_journal_size_estimate();
+
+ // reads
+
+ /// Result code for read_entry
+ enum read_entry_result {
+ SUCCESS,
+ FAILURE,
+ MAYBE_CORRUPT
+ };
+
+ /**
+ * read_entry
+ *
+ * Reads next entry starting at pos. If the entry appears
+ * clean, *bl will contain the payload, *seq will contain
+ * the sequence number, and *out_pos will reflect the next
+ * read position. If the entry is invalid *ss will contain
+ * debug text, while *seq, *out_pos, and *bl will be unchanged.
+ *
+ * If the entry suggests a corrupt log, *ss will contain debug
+ * text, *out_pos will contain the next index to check. If
+ * we find an entry in this way that returns SUCCESS, the journal
+ * is most likely corrupt.
+ */
+ read_entry_result do_read_entry(
+ off64_t pos, ///< [in] position to read
+ off64_t *next_pos, ///< [out] next position to read
+ bufferlist* bl, ///< [out] payload for successful read
+ uint64_t *seq, ///< [out] seq of successful read
+ ostream *ss, ///< [out] error output
+ entry_header_t *h = 0 ///< [out] header
+ ) const; ///< @return result code
+
+ bool read_entry(
+ bufferlist &bl,
+ uint64_t &last_seq,
+ bool *corrupt
+ );
+
+ bool read_entry(
+ bufferlist &bl,
+ uint64_t &last_seq) override {
+ return read_entry(bl, last_seq, 0);
+ }
+
+ // Debug/Testing
+ void get_header(
+ uint64_t wanted_seq,
+ off64_t *_pos,
+ entry_header_t *h);
+ void corrupt(
+ int wfd,
+ off64_t corrupt_at);
+ void corrupt_payload(
+ int wfd,
+ uint64_t seq);
+ void corrupt_footer_magic(
+ int wfd,
+ uint64_t seq);
+ void corrupt_header_magic(
+ int wfd,
+ uint64_t seq);
+};
+
+WRITE_CLASS_ENCODER(FileJournal::header_t)
+
+#endif