1 files changed, 440 insertions, 0 deletions
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
new file mode 100644
index 000000000..0ea51d922
--- /dev/null
+++ b/src/rocksdb/db/write_thread.h
@@ -0,0 +1,440 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <mutex>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/write_callback.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteThread {
+ public:
+  enum State : uint8_t {
+    // The initial state of a writer.  This is a Writer that is
+    // waiting in JoinBatchGroup.  This state can be left when another
+    // thread informs the waiter that it has become a group leader
+    // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
+    // non-parallel informs a follower that its writes have been committed
+    // (-> STATE_COMPLETED), or when a leader that has chosen to perform
+    // updates in parallel and needs this Writer to apply its batch (->
+    // STATE_PARALLEL_MEMTABLE_WRITER).
+    STATE_INIT = 1,
+
+    // The state used to inform a waiting Writer that it has become the
+    // leader, and it should now build a write batch group.  Tricky:
+    // this state is not used if newest_writer_ is empty when a writer
+    // enqueues itself, because there is no need to wait (or even to
+    // create the mutex and condvar used to wait) in that case.  This is
+    // a terminal state unless the leader chooses to make this a parallel
+    // batch, in which case the last parallel worker to finish will move
+    // the leader to STATE_COMPLETED.
+    STATE_GROUP_LEADER = 2,
+
+    // The state used to inform a waiting writer that it has become the
+    // leader of memtable writer group. The leader will either write
+    // memtable for the whole group, or launch a parallel group write
+    // to memtable by calling LaunchParallelMemTableWrite.
+    STATE_MEMTABLE_WRITER_LEADER = 4,
+
+    // The state used to inform a waiting writer that it has become a
+    // parallel memtable writer. It can be the group leader who launch the
+    // parallel writer group, or one of the followers. The writer should then
+    // apply its batch to the memtable concurrently and call
+    // CompleteParallelMemTableWriter.
+    STATE_PARALLEL_MEMTABLE_WRITER = 8,
+
+    // A follower whose writes have been applied, or a parallel leader
+    // whose followers have all finished their work.  This is a terminal
+    // state.
+    STATE_COMPLETED = 16,
+
+    // A state indicating that the thread may be waiting using StateMutex()
+    // and StateCondVar()
+    STATE_LOCKED_WAITING = 32,
+  };
+
+  struct Writer;
+
+  struct WriteGroup {
+    Writer* leader = nullptr;
+    Writer* last_writer = nullptr;
+    SequenceNumber last_sequence;
+    // before running goes to zero, status needs leader->StateMutex()
+    Status status;
+    std::atomic<size_t> running;
+    size_t size = 0;
+
+    struct Iterator {
+      Writer* writer;
+      Writer* last_writer;
+
+      explicit Iterator(Writer* w, Writer* last)
+          : writer(w), last_writer(last) {}
+
+      Writer* operator*() const { return writer; }
+
+      Iterator& operator++() {
+        assert(writer != nullptr);
+        if (writer == last_writer) {
+          writer = nullptr;
+        } else {
+          writer = writer->link_newer;
+        }
+        return *this;
+      }
+
+      bool operator!=(const Iterator& other) const {
+        return writer != other.writer;
+      }
+    };
+
+    Iterator begin() const { return Iterator(leader, last_writer); }
+    Iterator end() const { return Iterator(nullptr, nullptr); }
+  };
+
+  // Information kept for every waiting writer.
+  struct Writer {
+    WriteBatch* batch;
+    bool sync;
+    bool no_slowdown;
+    bool disable_wal;
+    Env::IOPriority rate_limiter_priority;
+    bool disable_memtable;
+    size_t batch_cnt;  // if non-zero, number of sub-batches in the write batch
+    size_t protection_bytes_per_key;
+    PreReleaseCallback* pre_release_callback;
+    PostMemTableCallback* post_memtable_callback;
+    uint64_t log_used;  // log number that this batch was inserted into
+    uint64_t log_ref;   // log number that memtable insert should reference
+    WriteCallback* callback;
+    bool made_waitable;          // records lazy construction of mutex and cv
+    std::atomic<uint8_t> state;  // write under StateMutex() or pre-link
+    WriteGroup* write_group;
+    SequenceNumber sequence;  // the sequence number to use for the first key
+    Status status;
+    Status callback_status;  // status returned by callback->Callback()
+
+    std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
+    std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
+    Writer* link_older;  // read/write only before linking, or as leader
+    Writer* link_newer;  // lazy, read/write only before linking, or as leader
+
+    Writer()
+        : batch(nullptr),
+          sync(false),
+          no_slowdown(false),
+          disable_wal(false),
+          rate_limiter_priority(Env::IOPriority::IO_TOTAL),
+          disable_memtable(false),
+          batch_cnt(0),
+          protection_bytes_per_key(0),
+          pre_release_callback(nullptr),
+          post_memtable_callback(nullptr),
+          log_used(0),
+          log_ref(0),
+          callback(nullptr),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    Writer(const WriteOptions& write_options, WriteBatch* _batch,
+           WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
+           size_t _batch_cnt = 0,
+           PreReleaseCallback* _pre_release_callback = nullptr,
+           PostMemTableCallback* _post_memtable_callback = nullptr)
+        : batch(_batch),
+          sync(write_options.sync),
+          no_slowdown(write_options.no_slowdown),
+          disable_wal(write_options.disableWAL),
+          rate_limiter_priority(write_options.rate_limiter_priority),
+          disable_memtable(_disable_memtable),
+          batch_cnt(_batch_cnt),
+          protection_bytes_per_key(_batch->GetProtectionBytesPerKey()),
+          pre_release_callback(_pre_release_callback),
+          post_memtable_callback(_post_memtable_callback),
+          log_used(0),
+          log_ref(_log_ref),
+          callback(_callback),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    ~Writer() {
+      if (made_waitable) {
+        StateMutex().~mutex();
+        StateCV().~condition_variable();
+      }
+      status.PermitUncheckedError();
+      callback_status.PermitUncheckedError();
+    }
+
+    bool CheckCallback(DB* db) {
+      if (callback != nullptr) {
+        callback_status = callback->Callback(db);
+      }
+      return callback_status.ok();
+    }
+
+    void CreateMutex() {
+      if (!made_waitable) {
+        // Note that made_waitable is tracked separately from state
+        // transitions, because we can't atomically create the mutex and
+        // link into the list.
+        made_waitable = true;
+        new (&state_mutex_bytes) std::mutex;
+        new (&state_cv_bytes) std::condition_variable;
+      }
+    }
+
+    // returns the aggregate status of this Writer
+    Status FinalStatus() {
+      if (!status.ok()) {
+        // a non-ok memtable write status takes presidence
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      } else if (!callback_status.ok()) {
+        // if the callback failed then that is the status we want
+        // because a memtable insert should not have been attempted
+        assert(callback != nullptr);
+        assert(status.ok());
+        return callback_status;
+      } else {
+        // if there is no callback then we only care about
+        // the memtable insert status
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      }
+    }
+
+    bool CallbackFailed() {
+      return (callback != nullptr) && !callback_status.ok();
+    }
+
+    bool ShouldWriteToMemtable() {
+      return status.ok() && !CallbackFailed() && !disable_memtable;
+    }
+
+    bool ShouldWriteToWAL() {
+      return status.ok() && !CallbackFailed() && !disable_wal;
+    }
+
+    // No other mutexes may be acquired while holding StateMutex(), it is
+    // always last in the order
+    std::mutex& StateMutex() {
+      assert(made_waitable);
+      return *static_cast<std::mutex*>(static_cast<void*>(&state_mutex_bytes));
+    }
+
+    std::condition_variable& StateCV() {
+      assert(made_waitable);
+      return *static_cast<std::condition_variable*>(
+          static_cast<void*>(&state_cv_bytes));
+    }
+  };
+
+  struct AdaptationContext {
+    const char* name;
+    std::atomic<int32_t> value;
+
+    explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
+  };
+
+  explicit WriteThread(const ImmutableDBOptions& db_options);
+
+  virtual ~WriteThread() = default;
+
+  // IMPORTANT: None of the methods in this class rely on the db mutex
+  // for correctness. All of the methods except JoinBatchGroup and
+  // EnterUnbatched may be called either with or without the db mutex held.
+  // Correctness is maintained by ensuring that only a single thread is
+  // a leader at a time.
+
+  // Registers w as ready to become part of a batch group, waits until the
+  // caller should perform some work, and returns the current state of the
+  // writer.  If w has become the leader of a write batch group, returns
+  // STATE_GROUP_LEADER.  If w has been made part of a sequential batch
+  // group and the leader has performed the write, returns STATE_DONE.
+  // If w has been made part of a parallel batch group and is responsible
+  // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
+  //
+  // The db mutex SHOULD NOT be held when calling this function, because
+  // it will block.
+  //
+  // Writer* w:        Writer to be executed as part of a batch group
+  void JoinBatchGroup(Writer* w);
+
+  // Constructs a write batch group led by leader, which should be a
+  // Writer passed to JoinBatchGroup on the current thread.
+  //
+  // Writer* leader:          Writer that is STATE_GROUP_LEADER
+  // WriteGroup* write_group: Out-param of group members
+  // returns:                 Total batch group byte size
+  size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
+
+  // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
+  // and wakes up the next leader (if any).
+  //
+  // WriteGroup* write_group: the write group
+  // Status status:           Status of write operation
+  void ExitAsBatchGroupLeader(WriteGroup& write_group, Status& status);
+
+  // Exit batch group on behalf of batch group leader.
+  void ExitAsBatchGroupFollower(Writer* w);
+
+  // Constructs a write batch group led by leader from newest_memtable_writers_
+  // list. The leader should either write memtable for the whole group and
+  // call ExitAsMemTableWriter, or launch parallel memtable write through
+  // LaunchParallelMemTableWriters.
+  void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_grup);
+
+  // Memtable writer group leader, or the last finished writer in a parallel
+  // write group, exit from the newest_memtable_writers_ list, and wake up
+  // the next leader if needed.
+  void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
+
+  // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of
+  // the non-leader members of this write batch group.  Sets Writer::sequence
+  // before waking them up.
+  //
+  // WriteGroup* write_group: Extra state used to coordinate the parallel add
+  void LaunchParallelMemTableWriters(WriteGroup* write_group);
+
+  // Reports the completion of w's batch to the parallel group leader, and
+  // waits for the rest of the parallel batch to complete.  Returns true
+  // if this thread is the last to complete, and hence should advance
+  // the sequence number and then call EarlyExitParallelGroup, false if
+  // someone else has already taken responsibility for that.
+  bool CompleteParallelMemTableWriter(Writer* w);
+
+  // Waits for all preceding writers (unlocking mu while waiting), then
+  // registers w as the currently proceeding writer.
+  //
+  // Writer* w:              A Writer not eligible for batching
+  // InstrumentedMutex* mu:  The db mutex, to unlock while waiting
+  // REQUIRES: db mutex held
+  void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
+
+  // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+  // writers.
+  void ExitUnbatched(Writer* w);
+
+  // Wait for all parallel memtable writers to finish, in case pipelined
+  // write is enabled.
+  void WaitForMemTableWriters();
+
+  SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
+    if (sequence > last_sequence_) {
+      last_sequence_ = sequence;
+    }
+    return last_sequence_;
+  }
+
+  // Insert a dummy writer at the tail of the write queue to indicate a write
+  // stall, and fail any writers in the queue with no_slowdown set to true
+  void BeginWriteStall();
+
+  // Remove the dummy writer and wake up waiting writers
+  void EndWriteStall();
+
+ private:
+  // See AwaitState.
+  const uint64_t max_yield_usec_;
+  const uint64_t slow_yield_usec_;
+
+  // Allow multiple writers write to memtable concurrently.
+  const bool allow_concurrent_memtable_write_;
+
+  // Enable pipelined write to WAL and memtable.
+  const bool enable_pipelined_write_;
+
+  // The maximum limit of number of bytes that are written in a single batch
+  // of WAL or memtable write. It is followed when the leader write size
+  // is larger than 1/8 of this limit.
+  const uint64_t max_write_batch_group_size_bytes;
+
+  // Points to the newest pending writer. Only leader can remove
+  // elements, adding can be done lock-free by anybody.
+  std::atomic<Writer*> newest_writer_;
+
+  // Points to the newest pending memtable writer. Used only when pipelined
+  // write is enabled.
+  std::atomic<Writer*> newest_memtable_writer_;
+
+  // The last sequence that have been consumed by a writer. The sequence
+  // is not necessary visible to reads because the writer can be ongoing.
+  SequenceNumber last_sequence_;
+
+  // A dummy writer to indicate a write stall condition. This will be inserted
+  // at the tail of the writer queue by the leader, so newer writers can just
+  // check for this and bail
+  Writer write_stall_dummy_;
+
+  // Mutex and condvar for writers to block on a write stall. During a write
+  // stall, writers with no_slowdown set to false will wait on this rather
+  // on the writer queue
+  port::Mutex stall_mu_;
+  port::CondVar stall_cv_;
+
+  // Waits for w->state & goal_mask using w->StateMutex().  Returns
+  // the state that satisfies goal_mask.
+  uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
+
+  // Blocks until w->state & goal_mask, returning the state value
+  // that satisfied the predicate.  Uses ctx to adaptively use
+  // std::this_thread::yield() to avoid mutex overheads.  ctx should be
+  // a context-dependent static.
+  uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
+
+  // Set writer state and wake the writer up if it is waiting.
+  void SetState(Writer* w, uint8_t new_state);
+
+  // Links w into the newest_writer list. Return true if w was linked directly
+  // into the leader position.  Safe to call from multiple threads without
+  // external locking.
+  bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
+
+  // Link write group into the newest_writer list as a whole, while keeping the
+  // order of the writers unchanged. Return true if the group was linked
+  // directly into the leader position.
+  bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
+
+  // Computes any missing link_newer links.  Should not be called
+  // concurrently with itself.
+  void CreateMissingNewerLinks(Writer* head);
+
+  // Set the leader in write_group to completed state and remove it from the
+  // write group.
+  void CompleteLeader(WriteGroup& write_group);
+
+  // Set a follower in write_group to completed state and remove it from the
+  // write group.
+  void CompleteFollower(Writer* w, WriteGroup& write_group);
+};
+
+}  // namespace ROCKSDB_NAMESPACE