1 files changed, 523 insertions, 0 deletions
diff --git a/storage/tokudb/PerconaFT/locktree/locktree.h b/storage/tokudb/PerconaFT/locktree/locktree.h
new file mode 100644
index 00000000..4d9e5bda
--- /dev/null
+++ b/storage/tokudb/PerconaFT/locktree/locktree.h
@@ -0,0 +1,523 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <atomic>
+
+#include <db.h>
+#include <toku_pthread.h>
+#include <toku_time.h>
+
+#include <ft/comparator.h>
+#include <ft/ft-ops.h>  // just for DICTIONARY_ID..
+
+#include <util/omt.h>
+
+#include "txnid_set.h"
+#include "wfg.h"
+#include "range_buffer.h"
+
+
+namespace toku {
+
+    class locktree;
+    class locktree_manager;
+    class lock_request;
+    class concurrent_tree;
+
+    typedef int  (*lt_create_cb)(locktree *lt, void *extra);
+    typedef void (*lt_destroy_cb)(locktree *lt);
+    typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt, const range_buffer &buffer, void *extra);
+
+    struct lt_counters {
+        uint64_t wait_count, wait_time;
+        uint64_t long_wait_count, long_wait_time;
+        uint64_t timeout_count;
+
+        void add(const lt_counters &rhs) {
+            wait_count += rhs.wait_count;
+            wait_time += rhs.wait_time;
+            long_wait_count += rhs.long_wait_count;
+            long_wait_time += rhs.long_wait_time;
+            timeout_count += rhs.timeout_count;
+        }
+    };
+
+    // Lock request state for some locktree
+    struct lt_lock_request_info {
+        omt<lock_request *> pending_lock_requests;
+        std::atomic_bool pending_is_empty;
+        toku_mutex_t mutex;
+        bool should_retry_lock_requests;
+        lt_counters counters;
+        std::atomic_ullong retry_want;
+        unsigned long long retry_done;
+        toku_mutex_t retry_mutex;
+        toku_cond_t retry_cv;
+        bool running_retry;
+
+        void init(void);
+        void destroy(void);
+    };
+
+    // The locktree manager manages a set of locktrees, one for each open
+    // dictionary. Locktrees are retrieved from the manager. When they are no
+    // longer needed, they are be released by the user.
+    class locktree_manager {
+       public:
+        // param: create_cb, called just after a locktree is first created.
+        //        destroy_cb, called just before a locktree is destroyed.
+        //        escalate_cb, called after a locktree is escalated (with extra
+        //        param)
+        void create(lt_create_cb create_cb,
+                    lt_destroy_cb destroy_cb,
+                    lt_escalate_cb escalate_cb,
+                    void *extra);
+
+        void destroy(void);
+
+        size_t get_max_lock_memory(void);
+
+        int set_max_lock_memory(size_t max_lock_memory);
+
+        // effect: Get a locktree from the manager. If a locktree exists with the given
+        //         dict_id, it is referenced and then returned. If one did not exist, it
+        //         is created. It will use the comparator for comparing keys. The on_create
+        //         callback (passed to locktree_manager::create()) will be called with the
+        //         given extra parameter.
+        locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp, void *on_create_extra);
+
+        void reference_lt(locktree *lt);
+
+        // effect: Releases one reference on a locktree. If the reference count transitions
+        //         to zero, the on_destroy callback is called before it gets destroyed.
+        void release_lt(locktree *lt);
+
+        void get_status(LTM_STATUS status);
+
+        // effect: calls the iterate function on each pending lock request
+        // note: holds the manager's mutex
+        typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
+                                                     TXNID txnid,
+                                                     const DBT *left_key,
+                                                     const DBT *right_key,
+                                                     TXNID blocking_txnid,
+                                                     uint64_t start_time,
+                                                     void *extra);
+        int iterate_pending_lock_requests(lock_request_iterate_callback cb, void *extra);
+
+        // effect: Determines if too many locks or too much memory is being used,
+        //         Runs escalation on the manager if so.
+        // param: big_txn, if the current transaction is 'big' (has spilled rollback logs)
+        // returns: 0 if there enough resources to create a new lock, or TOKUDB_OUT_OF_LOCKS 
+        //          if there are not enough resources and lock escalation failed to free up
+        //          enough resources for a new lock.
+        int check_current_lock_constraints(bool big_txn);
+
+        bool over_big_threshold(void);
+
+        void note_mem_used(uint64_t mem_used);
+
+        void note_mem_released(uint64_t mem_freed);
+
+        bool out_of_locks(void) const;
+
+        // Escalate all locktrees
+        void escalate_all_locktrees(void);
+
+        // Escalate a set of locktrees
+        void escalate_locktrees(locktree **locktrees, int num_locktrees);
+
+        // effect: calls the private function run_escalation(), only ok to
+        //         do for tests.
+        // rationale: to get better stress test coverage, we want a way to
+        //            deterministicly trigger lock escalation.
+        void run_escalation_for_test(void);
+        void run_escalation(void);
+
+        // Add time t to the escalator's wait time statistics
+        void add_escalator_wait_time(uint64_t t);
+
+        void kill_waiter(void *extra);
+
+    private:
+        static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;
+
+        // tracks the current number of locks and lock memory
+        uint64_t m_max_lock_memory;
+        uint64_t m_current_lock_memory;
+
+        struct lt_counters m_lt_counters;
+
+        // the create and destroy callbacks for the locktrees
+        lt_create_cb m_lt_create_callback;
+        lt_destroy_cb m_lt_destroy_callback;
+        lt_escalate_cb m_lt_escalate_callback;
+        void *m_lt_escalate_callback_extra;
+
+        omt<locktree *> m_locktree_map;
+
+        // the manager's mutex protects the locktree map
+        toku_mutex_t m_mutex;
+
+        void mutex_lock(void);
+
+        void mutex_unlock(void);
+
+        // Manage the set of open locktrees
+        locktree *locktree_map_find(const DICTIONARY_ID &dict_id);
+        void locktree_map_put(locktree *lt);
+        void locktree_map_remove(locktree *lt);
+
+        static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);
+
+        void escalator_init(void);
+        void escalator_destroy(void);
+
+        // statistics about lock escalation.
+        toku_mutex_t m_escalation_mutex;
+        uint64_t m_escalation_count;
+        tokutime_t m_escalation_time;
+        uint64_t m_escalation_latest_result;
+        uint64_t m_wait_escalation_count;
+        uint64_t m_wait_escalation_time;
+        uint64_t m_long_wait_escalation_count;
+        uint64_t m_long_wait_escalation_time;
+
+        // the escalator coordinates escalation on a set of locktrees for a bunch of threads
+        class locktree_escalator {
+        public:
+            void create(void);
+            void destroy(void);
+            void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra), void *extra);
+
+        private:
+            toku_mutex_t m_escalator_mutex;
+            toku_cond_t m_escalator_done;
+            bool m_escalator_running;
+        };
+
+        locktree_escalator m_escalator;
+
+        friend class manager_unit_test;
+    };
+
+    // A locktree represents the set of row locks owned by all transactions
+    // over an open dictionary. Read and write ranges are represented as
+    // a left and right key which are compared with the given comparator
+    //
+    // Locktrees are not created and destroyed by the user. Instead, they are
+    // referenced and released using the locktree manager.
+    //
+    // A sample workflow looks like this:
+    // - Create a manager.
+    // - Get a locktree by dictionaroy id from the manager.
+    // - Perform read/write lock acquision on the locktree, add references to
+    //   the locktree using the manager, release locks, release references, etc.
+    // - ...
+    // - Release the final reference to the locktree. It will be destroyed.
+    // - Destroy the manager.
+    class locktree {
+    public:
+        // effect: Creates a locktree
+        void create(locktree_manager *mgr, DICTIONARY_ID dict_id, const comparator &cmp);
+
+        void destroy(void);
+
+        // For thread-safe, external reference counting
+        void add_reference(void);
+
+        // requires: the reference count is > 0
+        // returns: the reference count, after decrementing it by one
+        uint32_t release_reference(void);
+
+        // returns: the current reference count
+        uint32_t get_reference_count(void);
+
+        // effect: Attempts to grant a read lock for the range of keys between [left_key, right_key].
+        // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the
+        //          given conflicts set with the txnids that hold conflicting locks in the range.
+        //          If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS.
+        // note: Read locks cannot be shared between txnids, as one would expect.
+        //       This is for simplicity since read locks are rare in MySQL.
+        int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn);
+
+        // effect: Attempts to grant a write lock for the range of keys between [left_key, right_key].
+        // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the
+        //          given conflicts set with the txnids that hold conflicting locks in the range.
+        //          If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS.
+        int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn);
+
+        // effect: populate the conflicts set with the txnids that would preventing
+        //         the given txnid from getting a lock on [left_key, right_key]
+        void get_conflicts(bool is_write_request, TXNID txnid,
+                const DBT *left_key, const DBT *right_key, txnid_set *conflicts);
+
+        // effect: Release all of the lock ranges represented by the range buffer for a txnid.
+        void release_locks(TXNID txnid, const range_buffer *ranges);
+
+        // effect: Runs escalation on this locktree
+        void escalate(lt_escalate_cb after_escalate_callback, void *extra);
+
+        // returns: The userdata associated with this locktree, or null if it has not been set.
+        void *get_userdata(void) const;
+
+        void set_userdata(void *userdata);
+
+        locktree_manager *get_manager(void) const;
+
+        void set_comparator(const comparator &cmp);
+
+        int compare(const locktree *lt) const;
+
+        DICTIONARY_ID get_dict_id() const;
+
+        // Private info struct for storing pending lock request state.
+        // Only to be used by lock requests. We store it here as
+        // something less opaque than usual to strike a tradeoff between
+        // abstraction and code complexity. It is still fairly abstract
+        // since the lock_request object is opaque 
+        struct lt_lock_request_info *get_lock_request_info(void);
+
+    private:
+        locktree_manager *m_mgr;
+        DICTIONARY_ID m_dict_id;
+        uint32_t m_reference_count;
+
+        // Since the memory referenced by this comparator is not owned by the
+        // locktree, the user must guarantee it will outlive the locktree.
+        //
+        // The ydb API accomplishes this by opening an ft_handle in the on_create
+        // callback, which will keep the underlying FT (and its descriptor) in memory
+        // for as long as the handle is open. The ft_handle is stored opaquely in the
+        // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra
+        comparator m_cmp;
+
+        concurrent_tree *m_rangetree;
+
+        void *m_userdata;
+        struct lt_lock_request_info m_lock_request_info;
+
+        // The following fields and members prefixed with "sto_" are for
+        // the single txnid optimization, intended to speed up the case
+        // when only one transaction is using the locktree. If we know
+        // the locktree has only one transaction, then acquiring locks
+        // takes O(1) work and releasing all locks takes O(1) work.
+        //
+        // How do we know that the locktree only has a single txnid?
+        // What do we do if it does?
+        //
+        // When a txn with txnid T requests a lock:
+        // - If the tree is empty, the optimization is possible. Set the single
+        // txnid to T, and insert the lock range into the buffer.
+        // - If the tree is not empty, check if the single txnid is T. If so,
+        // append the lock range to the buffer. Otherwise, migrate all of
+        // the locks in the buffer into the rangetree on behalf of txnid T,
+        // and invalid the single txnid.
+        //
+        // When a txn with txnid T releases its locks:
+        // - If the single txnid is valid, it must be for T. Destroy the buffer.
+        // - If it's not valid, release locks the normal way in the rangetree.
+        //
+        // To carry out the optimization we need to record a single txnid
+        // and a range buffer for each locktree, each protected by the root
+        // lock of the locktree's rangetree. The root lock for a rangetree
+        // is grabbed by preparing a locked keyrange on the rangetree.
+        TXNID m_sto_txnid;
+        range_buffer m_sto_buffer;
+
+        // The single txnid optimization speeds up the case when only one
+        // transaction is using the locktree. But it has the potential to
+        // hurt the case when more than one txnid exists.
+        //
+        // There are two things we need to do to make the optimization only
+        // optimize the case we care about, and not hurt the general case.
+        //
+        // Bound the worst-case latency for lock migration when the
+        // optimization stops working:
+        // - Idea: Stop the optimization and migrate immediate if we notice
+        // the single txnid has takes many locks in the range buffer.
+        // - Implementation: Enforce a max size on the single txnid range buffer.
+        // - Analysis: Choosing the perfect max value, M, is difficult to do
+        // without some feedback from the field. Intuition tells us that M should
+        // not be so small that the optimization is worthless, and it should not
+        // be so big that it's unreasonable to have to wait behind a thread doing
+        // the work of converting M buffer locks into rangetree locks.
+        //
+        // Prevent concurrent-transaction workloads from trying the optimization
+        // in vain:
+        // - Idea: Don't even bother trying the optimization if we think the
+        // system is in a concurrent-transaction state.
+        // - Implementation: Do something even simpler than detecting whether the
+        // system is in a concurent-transaction state. Just keep a "score" value
+        // and some threshold. If at any time the locktree is eligible for the
+        // optimization, only do it if the score is at this threshold. When you
+        // actually do the optimization but someone has to migrate locks in the buffer
+        // (expensive), then reset the score back to zero. Each time a txn
+        // releases locks, the score is incremented by 1.
+        // - Analysis: If you let the threshold be "C", then at most 1 / C txns will
+        // do the optimization in a concurrent-transaction system. Similarly, it
+        // takes at most C txns to start using the single txnid optimzation, which
+        // is good when the system transitions from multithreaded to single threaded.
+        //
+        // STO_BUFFER_MAX_SIZE:
+        //
+        // We choose the max value to be 1 million since most transactions are smaller
+        // than 1 million and we can create a rangetree of 1 million elements in
+        // less than a second. So we can be pretty confident that this threshold
+        // enables the optimization almost always, and prevents super pathological
+        // latency issues for the first lock taken by a second thread.
+        //
+        // STO_SCORE_THRESHOLD:
+        //
+        // A simple first guess at a good value for the score threshold is 100.
+        // By our analysis, we'd end up doing the optimization in vain for
+        // around 1% of all transactions, which seems reasonable. Further,
+        // if the system goes single threaded, it ought to be pretty quick
+        // for 100 transactions to go by, so we won't have to wait long before
+        // we start doing the single txind optimzation again.
+        static const int STO_BUFFER_MAX_SIZE = 50 * 1024;
+        static const int STO_SCORE_THRESHOLD = 100;
+        int m_sto_score;
+
+        // statistics about time spent ending the STO early
+        uint64_t m_sto_end_early_count;
+        tokutime_t m_sto_end_early_time;
+
+        // effect: begins the single txnid optimizaiton, setting m_sto_txnid
+        //         to the given txnid.
+        // requires: m_sto_txnid is invalid
+        void sto_begin(TXNID txnid);
+
+        // effect: append a range to the sto buffer
+        // requires: m_sto_txnid is valid
+        void sto_append(const DBT *left_key, const DBT *right_key);
+
+        // effect: ends the single txnid optimization, releaseing any memory
+        //         stored in the sto buffer, notifying the tracker, and
+        //         invalidating m_sto_txnid.
+        // requires: m_sto_txnid is valid
+        void sto_end(void);
+
+        // params: prepared_lkr is a void * to a prepared locked keyrange. see below.
+        // effect: ends the single txnid optimization early, migrating buffer locks
+        //         into the rangetree, calling sto_end(), and then setting the
+        //         sto_score back to zero.
+        // requires: m_sto_txnid is valid
+        void sto_end_early(void *prepared_lkr);
+        void sto_end_early_no_accounting(void *prepared_lkr);
+
+        // params: prepared_lkr is a void * to a prepared locked keyrange. we can't use
+        //         the real type because the compiler won't allow us to forward declare
+        //         concurrent_tree::locked_keyrange without including concurrent_tree.h,
+        //         which we cannot do here because it is a template implementation.
+        // requires: the prepared locked keyrange is for the locktree's rangetree
+        // requires: m_sto_txnid is valid
+        // effect: migrates each lock in the single txnid buffer into the locktree's
+        //         rangetree, notifying the memory tracker as necessary.
+        void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);
+
+        // effect: If m_sto_txnid is valid, then release the txnid's locks
+        //         by ending the optimization.
+        // requires: If m_sto_txnid is valid, it is equal to the given txnid
+        // returns: True if locks were released for this txnid
+        bool sto_try_release(TXNID txnid);
+
+        // params: prepared_lkr is a void * to a prepared locked keyrange. see above.
+        // requires: the prepared locked keyrange is for the locktree's rangetree
+        // effect: If m_sto_txnid is valid and equal to the given txnid, then
+        // append a range onto the buffer. Otherwise, if m_sto_txnid is valid
+        //        but not equal to this txnid, then migrate the buffer's locks
+        //        into the rangetree and end the optimization, setting the score
+        //        back to zero.
+        // returns: true if the lock was acquired for this txnid
+        bool sto_try_acquire(void *prepared_lkr, TXNID txnid,
+                const DBT *left_key, const DBT *right_key);
+
+        // Effect:
+        //  Provides a hook for a helgrind suppression.
+        // Returns:
+        //  true if m_sto_txnid is not TXNID_NONE
+        bool sto_txnid_is_valid_unsafe(void) const;
+
+        // Effect:
+        //  Provides a hook for a helgrind suppression.
+        // Returns:
+        //  m_sto_score
+        int sto_get_score_unsafe(void )const;
+
+        void remove_overlapping_locks_for_txnid(TXNID txnid,
+                                                const DBT *left_key, const DBT *right_key);
+
+        int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
+                                      const DBT *left_key, const DBT *right_key,
+                                      txnid_set *conflicts);
+
+        int acquire_lock(bool is_write_request, TXNID txnid,
+                         const DBT *left_key, const DBT *right_key,
+                         txnid_set *conflicts);
+
+        int try_acquire_lock(bool is_write_request, TXNID txnid,
+                             const DBT *left_key, const DBT *right_key,
+                             txnid_set *conflicts, bool big_txn);
+
+
+        friend class locktree_unit_test;
+        friend class manager_unit_test;
+        friend class lock_request_unit_test;
+
+        // engine status reaches into the locktree to read some stats
+        friend void locktree_manager::get_status(LTM_STATUS status);
+    };
+
+} /* namespace toku */