From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- .../range_tree/lib/locktree/concurrent_tree.cc | 139 +++ .../range_tree/lib/locktree/concurrent_tree.h | 174 ++++ .../lock/range/range_tree/lib/locktree/keyrange.cc | 222 +++++ .../lock/range/range_tree/lib/locktree/keyrange.h | 141 +++ .../range/range_tree/lib/locktree/lock_request.cc | 527 ++++++++++ .../range/range_tree/lib/locktree/lock_request.h | 255 +++++ .../lock/range/range_tree/lib/locktree/locktree.cc | 1023 ++++++++++++++++++++ .../lock/range/range_tree/lib/locktree/locktree.h | 580 +++++++++++ .../lock/range/range_tree/lib/locktree/manager.cc | 527 ++++++++++ .../range/range_tree/lib/locktree/range_buffer.cc | 265 +++++ .../range/range_tree/lib/locktree/range_buffer.h | 178 ++++ .../lock/range/range_tree/lib/locktree/treenode.cc | 520 ++++++++++ .../lock/range/range_tree/lib/locktree/treenode.h | 302 ++++++ .../range/range_tree/lib/locktree/txnid_set.cc | 120 +++ .../lock/range/range_tree/lib/locktree/txnid_set.h | 92 ++ .../lock/range/range_tree/lib/locktree/wfg.cc | 213 ++++ .../lock/range/range_tree/lib/locktree/wfg.h | 124 +++ 17 files changed, 5402 insertions(+) create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc create mode 100644 src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h (limited to 'src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree') diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc new file mode 100644 index 000000000..5110cd482 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc @@ -0,0 +1,139 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "concurrent_tree.h" + +// PORT #include +namespace toku { + +void concurrent_tree::create(const comparator *cmp) { + // start with an empty root node. we do this instead of + // setting m_root to null so there's always a root to lock + m_root.create_root(cmp); +} + +void concurrent_tree::destroy(void) { m_root.destroy_root(); } + +bool concurrent_tree::is_empty(void) { return m_root.is_empty(); } + +uint64_t concurrent_tree::get_insertion_memory_overhead(void) { + return sizeof(treenode); +} + +void concurrent_tree::locked_keyrange::prepare(concurrent_tree *tree) { + // the first step in acquiring a locked keyrange is locking the root + treenode *const root = &tree->m_root; + m_tree = tree; + m_subtree = root; + m_range = keyrange::get_infinite_range(); + root->mutex_lock(); +} + +void concurrent_tree::locked_keyrange::acquire(const keyrange &range) { + treenode *const root = &m_tree->m_root; + + treenode *subtree; + if (root->is_empty() || root->range_overlaps(range)) { + subtree = root; + } else { + // we do not have a precomputed comparison hint, so pass null + const keyrange::comparison *cmp_hint = nullptr; + subtree = root->find_node_with_overlapping_child(range, cmp_hint); + } + + // subtree is locked. 
it will be unlocked when this is release()'d + invariant_notnull(subtree); + m_range = range; + m_subtree = subtree; +} + +bool concurrent_tree::locked_keyrange::add_shared_owner(const keyrange &range, + TXNID new_owner) { + return m_subtree->insert(range, new_owner, /*is_shared*/ true); +} + +void concurrent_tree::locked_keyrange::release(void) { + m_subtree->mutex_unlock(); +} + +void concurrent_tree::locked_keyrange::insert(const keyrange &range, + TXNID txnid, bool is_shared) { + // empty means no children, and only the root should ever be empty + if (m_subtree->is_empty()) { + m_subtree->set_range_and_txnid(range, txnid, is_shared); + } else { + m_subtree->insert(range, txnid, is_shared); + } +} + +void concurrent_tree::locked_keyrange::remove(const keyrange &range, + TXNID txnid) { + invariant(!m_subtree->is_empty()); + treenode *new_subtree = m_subtree->remove(range, txnid); + // if removing range changed the root of the subtree, + // then the subtree must be the root of the entire tree. + if (new_subtree == nullptr) { + invariant(m_subtree->is_root()); + invariant(m_subtree->is_empty()); + } +} + +void concurrent_tree::locked_keyrange::remove_all(void) { + m_subtree->recursive_remove(); +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h new file mode 100644 index 000000000..e1bfb86c5 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h @@ -0,0 +1,174 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../ft/comparator.h" +#include "keyrange.h" +#include "treenode.h" + +namespace toku { + +// A concurrent_tree stores non-overlapping ranges. +// Access to disjoint parts of the tree usually occurs concurrently. + +class concurrent_tree { + public: + // A locked_keyrange gives you exclusive access to read and write + // operations that occur on any keys in that range. You only have + // the right to operate on keys in that range or keys that were read + // from the keyrange using iterate() + // + // Access model: + // - user prepares a locked keyrange. all threads serialize behind prepare(). + // - user breaks the serialzation point by acquiring a range, or releasing. + // - one thread operates on a certain locked_keyrange object at a time. + // - when the thread is finished, it releases + + class locked_keyrange { + public: + // effect: prepare to acquire a locked keyrange over the given + // concurrent_tree, preventing other threads from preparing + // until this thread either does acquire() or release(). + // note: operations performed on a prepared keyrange are equivalent + // to ones performed on an acquired keyrange over -inf, +inf. + // rationale: this provides the user with a serialization point for + // descending + // or modifying the the tree. it also proives a convenient way of + // doing serializable operations on the tree. + // There are two valid sequences of calls: + // - prepare, acquire, [operations], release + // - prepare, [operations],release + void prepare(concurrent_tree *tree); + + // requires: the locked keyrange was prepare()'d + // effect: acquire a locked keyrange over the given concurrent_tree. + // the locked keyrange represents the range of keys overlapped + // by the given range + void acquire(const keyrange &range); + + // effect: releases a locked keyrange and the mutex it holds + void release(void); + + // effect: iterate over each range this locked_keyrange represents, + // calling function->fn() on each node's keyrange and txnid + // until there are no more or the function returns false + template + void iterate(F *function) const { + // if the subtree is non-empty, traverse it by calling the given + // function on each range, txnid pair found that overlaps. + if (!m_subtree->is_empty()) { + m_subtree->traverse_overlaps(m_range, function); + } + } + + // Adds another owner to the lock on the specified keyrange. + // requires: the keyrange contains one treenode whose bounds are + // exactly equal to the specifed range (no sub/supersets) + bool add_shared_owner(const keyrange &range, TXNID new_owner); + + // inserts the given range into the tree, with an associated txnid. + // requires: range does not overlap with anything in this locked_keyrange + // rationale: caller is responsible for only inserting unique ranges + void insert(const keyrange &range, TXNID txnid, bool is_shared); + + // effect: removes the given range from the tree. 
+ // - txnid=TXNID_ANY means remove the range no matter what its + // owners are + // - Other value means remove the specified txnid from + // ownership (if the range has other owners, it will remain + // in the tree) + // requires: range exists exactly in this locked_keyrange + // rationale: caller is responsible for only removing existing ranges + void remove(const keyrange &range, TXNID txnid); + + // effect: removes all of the keys represented by this locked keyrange + // rationale: we'd like a fast way to empty out a tree + void remove_all(void); + + private: + // the concurrent tree this locked keyrange is for + concurrent_tree *m_tree; + + // the range of keys this locked keyrange represents + keyrange m_range; + + // the subtree under which all overlapping ranges exist + treenode *m_subtree; + + friend class concurrent_tree_unit_test; + }; + + // effect: initialize the tree to an empty state + void create(const comparator *cmp); + + // effect: destroy the tree. + // requires: tree is empty + void destroy(void); + + // returns: true iff the tree is empty + bool is_empty(void); + + // returns: the memory overhead of a single insertion into the tree + static uint64_t get_insertion_memory_overhead(void); + + private: + // the root needs to always exist so there's a lock to grab + // even if the tree is empty. that's why we store a treenode + // here and not a pointer to one. + treenode m_root; + + friend class concurrent_tree_unit_test; +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc new file mode 100644 index 000000000..e50ace5a9 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc @@ -0,0 +1,222 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "keyrange.h" + +#include "../util/dbt.h" + +namespace toku { + +// create a keyrange by borrowing the left and right dbt +// pointers. no memory is copied. no checks for infinity needed. +void keyrange::create(const DBT *left, const DBT *right) { + init_empty(); + m_left_key = left; + m_right_key = right; +} + +// destroy the key copies. if they were never set, then destroy does nothing. +void keyrange::destroy(void) { + toku_destroy_dbt(&m_left_key_copy); + toku_destroy_dbt(&m_right_key_copy); +} + +// create a keyrange by copying the keys from the given range. +void keyrange::create_copy(const keyrange &range) { + // start with an initialized, empty range + init_empty(); + + // optimize the case where the left and right keys are the same. + // we'd like to only have one copy of the data. + if (toku_dbt_equals(range.get_left_key(), range.get_right_key())) { + set_both_keys(range.get_left_key()); + } else { + // replace our empty left and right keys with + // copies of the range's left and right keys + replace_left_key(range.get_left_key()); + replace_right_key(range.get_right_key()); + } +} + +// extend this keyrange by choosing the leftmost and rightmost +// endpoints between this range and the given. replaced keys +// in this range are freed and inherited keys are copied. +void keyrange::extend(const comparator &cmp, const keyrange &range) { + const DBT *range_left = range.get_left_key(); + const DBT *range_right = range.get_right_key(); + if (cmp(range_left, get_left_key()) < 0) { + replace_left_key(range_left); + } + if (cmp(range_right, get_right_key()) > 0) { + replace_right_key(range_right); + } +} + +// how much memory does this keyrange take? +// - the size of the left and right keys +// --- ignore the fact that we may have optimized the point case. +// it complicates things for little gain. +// - the size of the keyrange class itself +uint64_t keyrange::get_memory_size(void) const { + const DBT *left_key = get_left_key(); + const DBT *right_key = get_right_key(); + return left_key->size + right_key->size + sizeof(keyrange); +} + +// compare ranges. +keyrange::comparison keyrange::compare(const comparator &cmp, + const keyrange &range) const { + if (cmp(get_right_key(), range.get_left_key()) < 0) { + return comparison::LESS_THAN; + } else if (cmp(get_left_key(), range.get_right_key()) > 0) { + return comparison::GREATER_THAN; + } else if (cmp(get_left_key(), range.get_left_key()) == 0 && + cmp(get_right_key(), range.get_right_key()) == 0) { + return comparison::EQUALS; + } else { + return comparison::OVERLAPS; + } +} + +bool keyrange::overlaps(const comparator &cmp, const keyrange &range) const { + // equality is a stronger form of overlapping. + // so two ranges "overlap" if they're either equal or just overlapping. 
+ comparison c = compare(cmp, range); + return c == comparison::EQUALS || c == comparison::OVERLAPS; +} + +keyrange keyrange::get_infinite_range(void) { + keyrange range; + range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity()); + return range; +} + +void keyrange::init_empty(void) { + m_left_key = nullptr; + m_right_key = nullptr; + toku_init_dbt(&m_left_key_copy); + toku_init_dbt(&m_right_key_copy); + m_point_range = false; +} + +const DBT *keyrange::get_left_key(void) const { + if (m_left_key) { + return m_left_key; + } else { + return &m_left_key_copy; + } +} + +const DBT *keyrange::get_right_key(void) const { + if (m_right_key) { + return m_right_key; + } else { + return &m_right_key_copy; + } +} + +// copy the given once and set both the left and right pointers. +// optimization for point ranges, so the left and right ranges +// are not copied twice. +void keyrange::set_both_keys(const DBT *key) { + if (toku_dbt_is_infinite(key)) { + m_left_key = key; + m_right_key = key; + } else { + toku_clone_dbt(&m_left_key_copy, *key); + toku_copyref_dbt(&m_right_key_copy, m_left_key_copy); + } + m_point_range = true; +} + +// destroy the current left key. set and possibly copy the new one +void keyrange::replace_left_key(const DBT *key) { + // a little magic: + // + // if this is a point range, then the left and right keys share + // one copy of the data, and it lives in the left key copy. so + // if we're replacing the left key, move the real data to the + // right key copy instead of destroying it. now, the memory is + // owned by the right key and the left key may be replaced. + if (m_point_range) { + m_right_key_copy = m_left_key_copy; + } else { + toku_destroy_dbt(&m_left_key_copy); + } + + if (toku_dbt_is_infinite(key)) { + m_left_key = key; + } else { + toku_clone_dbt(&m_left_key_copy, *key); + m_left_key = nullptr; + } + m_point_range = false; +} + +// destroy the current right key. set and possibly copy the new one +void keyrange::replace_right_key(const DBT *key) { + toku_destroy_dbt(&m_right_key_copy); + if (toku_dbt_is_infinite(key)) { + m_right_key = key; + } else { + toku_clone_dbt(&m_right_key_copy, *key); + m_right_key = nullptr; + } + m_point_range = false; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h new file mode 100644 index 000000000..f9aeea0c4 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h @@ -0,0 +1,141 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . 
+ +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include "../ft/comparator.h" + +namespace toku { + +// A keyrange has a left and right key as endpoints. +// +// When a keyrange is created it owns no memory, but when it copies +// or extends another keyrange, it copies memory as necessary. This +// means it is cheap in the common case. + +class keyrange { + public: + // effect: constructor that borrows left and right key pointers. + // no memory is allocated or copied. + void create(const DBT *left_key, const DBT *right_key); + + // effect: constructor that allocates and copies another keyrange's points. + void create_copy(const keyrange &range); + + // effect: destroys the keyrange, freeing any allocated memory + void destroy(void); + + // effect: extends the keyrange by choosing the leftmost and rightmost + // endpoints from this range and the given range. + // replaced keys in this range are freed, new keys are copied. + void extend(const comparator &cmp, const keyrange &range); + + // returns: the amount of memory this keyrange takes. does not account + // for point optimizations or malloc overhead. + uint64_t get_memory_size(void) const; + + // returns: pointer to the left key of this range + const DBT *get_left_key(void) const; + + // returns: pointer to the right key of this range + const DBT *get_right_key(void) const; + + // two ranges are either equal, lt, gt, or overlapping + enum comparison { EQUALS, LESS_THAN, GREATER_THAN, OVERLAPS }; + + // effect: compares this range to the given range + // returns: LESS_THAN if given range is strictly to the left + // GREATER_THAN if given range is strictly to the right + // EQUALS if given range has the same left and right endpoints + // OVERLAPS if at least one of the given range's endpoints falls + // between this range's endpoints + comparison compare(const comparator &cmp, const keyrange &range) const; + + // returns: true if the range and the given range are equal or overlapping + bool overlaps(const comparator &cmp, const keyrange &range) const; + + // returns: a keyrange representing -inf, +inf + static keyrange get_infinite_range(void); + + private: + // some keys should be copied, some keys should not be. 
+ // + // to support both, we use two DBTs for copies and two pointers + // for temporaries. the access rule is: + // - if a pointer is non-null, then it reprsents the key. + // - otherwise the pointer is null, and the key is in the copy. + DBT m_left_key_copy; + DBT m_right_key_copy; + const DBT *m_left_key; + const DBT *m_right_key; + + // if this range is a point range, then m_left_key == m_right_key + // and the actual data is stored exactly once in m_left_key_copy. + bool m_point_range; + + // effect: initializes a keyrange to be empty + void init_empty(void); + + // effect: copies the given key once into the left key copy + // and sets the right key copy to share the left. + // rationale: optimization for point ranges to only do one malloc + void set_both_keys(const DBT *key); + + // effect: destroys the current left key. sets and copies the new one. + void replace_left_key(const DBT *key); + + // effect: destroys the current right key. sets and copies the new one. + void replace_right_key(const DBT *key); +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc new file mode 100644 index 000000000..3d217be70 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc @@ -0,0 +1,527 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
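For orientation, here is a minimal sketch (editorial, not part of the upstream patch) of how the concurrent_tree and keyrange interfaces added above fit together: a caller serializes on prepare(), narrows to the overlapping subtree with acquire(), operates, then release()s. The comparator and the caller-owned DBT endpoints are assumed to come from the surrounding locktree code; only methods declared in concurrent_tree.h and keyrange.h above are used.

#include "concurrent_tree.h"  // also pulls in keyrange.h and treenode.h, as above

// Sketch: take the whole-tree lock, insert one exclusive range lock, release.
static void sketch_insert_range(toku::concurrent_tree *tree, const DBT *left,
                                const DBT *right, TXNID txnid) {
  toku::keyrange range;
  range.create(left, right);  // borrows the DBT pointers, copies nothing

  toku::concurrent_tree::locked_keyrange lkr;
  lkr.prepare(tree);    // serialization point: locks the root node
  lkr.acquire(range);   // narrows the lock to the overlapping subtree
  // requires (per the header): range overlaps nothing already in the tree
  lkr.insert(range, txnid, /*is_shared*/ false);
  lkr.release();        // unlocks the subtree held since prepare()/acquire()

  range.destroy();      // no-op here, since create() copied no memory
}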
+ +#include "lock_request.h" + +#include "../portability/toku_race_tools.h" +#include "../portability/txn_subst.h" +#include "../util/dbt.h" +#include "locktree.h" + +namespace toku { + +// initialize a lock request's internals +void lock_request::create(toku_external_mutex_factory_t mutex_factory) { + m_txnid = TXNID_NONE; + m_conflicting_txnid = TXNID_NONE; + m_start_time = 0; + m_left_key = nullptr; + m_right_key = nullptr; + toku_init_dbt(&m_left_key_copy); + toku_init_dbt(&m_right_key_copy); + + m_type = type::UNKNOWN; + m_lt = nullptr; + + m_complete_r = 0; + m_state = state::UNINITIALIZED; + m_info = nullptr; + + // psergey-todo: this condition is for interruptible wait + // note: moved to here from lock_request::create: + toku_external_cond_init(mutex_factory, &m_wait_cond); + + m_start_test_callback = nullptr; + m_start_before_pending_test_callback = nullptr; + m_retry_test_callback = nullptr; +} + +// destroy a lock request. +void lock_request::destroy(void) { + invariant(m_state != state::PENDING); + invariant(m_state != state::DESTROYED); + m_state = state::DESTROYED; + toku_destroy_dbt(&m_left_key_copy); + toku_destroy_dbt(&m_right_key_copy); + toku_external_cond_destroy(&m_wait_cond); +} + +// set the lock request parameters. this API allows a lock request to be reused. +void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key, + const DBT *right_key, lock_request::type lock_type, + bool big_txn, void *extra) { + invariant(m_state != state::PENDING); + m_lt = lt; + + m_txnid = txnid; + m_left_key = left_key; + m_right_key = right_key; + toku_destroy_dbt(&m_left_key_copy); + toku_destroy_dbt(&m_right_key_copy); + m_type = lock_type; + m_state = state::INITIALIZED; + m_info = lt ? lt->get_lock_request_info() : nullptr; + m_big_txn = big_txn; + m_extra = extra; +} + +// get rid of any stored left and right key copies and +// replace them with copies of the given left and right key +void lock_request::copy_keys() { + if (!toku_dbt_is_infinite(m_left_key)) { + toku_clone_dbt(&m_left_key_copy, *m_left_key); + m_left_key = &m_left_key_copy; + } + if (!toku_dbt_is_infinite(m_right_key)) { + toku_clone_dbt(&m_right_key_copy, *m_right_key); + m_right_key = &m_right_key_copy; + } +} + +// what are the conflicts for this pending lock request? +void lock_request::get_conflicts(txnid_set *conflicts) { + invariant(m_state == state::PENDING); + const bool is_write_request = m_type == type::WRITE; + m_lt->get_conflicts(is_write_request, m_txnid, m_left_key, m_right_key, + conflicts); +} + +// build a wait-for-graph for this lock request and the given conflict set +// for each transaction B that blocks A's lock request +// if B is blocked then +// add (A,T) to the WFG and if B is new, fill in the WFG from B +void lock_request::build_wait_graph(wfg *wait_graph, + const txnid_set &conflicts) { + uint32_t num_conflicts = conflicts.size(); + for (uint32_t i = 0; i < num_conflicts; i++) { + TXNID conflicting_txnid = conflicts.get(i); + lock_request *conflicting_request = find_lock_request(conflicting_txnid); + invariant(conflicting_txnid != m_txnid); + invariant(conflicting_request != this); + if (conflicting_request) { + bool already_exists = wait_graph->node_exists(conflicting_txnid); + wait_graph->add_edge(m_txnid, conflicting_txnid); + if (!already_exists) { + // recursively build the wait for graph rooted at the conflicting + // request, given its set of lock conflicts. 
+ txnid_set other_conflicts; + other_conflicts.create(); + conflicting_request->get_conflicts(&other_conflicts); + conflicting_request->build_wait_graph(wait_graph, other_conflicts); + other_conflicts.destroy(); + } + } + } +} + +// returns: true if the current set of lock requests contains +// a deadlock, false otherwise. +bool lock_request::deadlock_exists(const txnid_set &conflicts) { + wfg wait_graph; + wait_graph.create(); + + build_wait_graph(&wait_graph, conflicts); + + std::function reporter; + if (m_deadlock_cb) { + reporter = [this](TXNID a) { + lock_request *req = find_lock_request(a); + if (req) { + m_deadlock_cb(req->m_txnid, (req->m_type == lock_request::WRITE), + req->m_left_key, req->m_right_key); + } + }; + } + + bool deadlock = wait_graph.cycle_exists_from_txnid(m_txnid, reporter); + wait_graph.destroy(); + return deadlock; +} + +// try to acquire a lock described by this lock request. +int lock_request::start(void) { + int r; + + txnid_set conflicts; + conflicts.create(); + if (m_type == type::WRITE) { + r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } else { + invariant(m_type == type::READ); + r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } + + // if the lock is not granted, save it to the set of lock requests + // and check for a deadlock. if there is one, complete it as failed + if (r == DB_LOCK_NOTGRANTED) { + copy_keys(); + m_state = state::PENDING; + m_start_time = toku_current_time_microsec() / 1000; + m_conflicting_txnid = conflicts.get(0); + if (m_start_before_pending_test_callback) + m_start_before_pending_test_callback(); + toku_external_mutex_lock(&m_info->mutex); + insert_into_lock_requests(); + if (deadlock_exists(conflicts)) { + remove_from_lock_requests(); + r = DB_LOCK_DEADLOCK; + } + toku_external_mutex_unlock(&m_info->mutex); + if (m_start_test_callback) m_start_test_callback(); // test callback + } + + if (r != DB_LOCK_NOTGRANTED) { + complete(r); + } + + conflicts.destroy(); + return r; +} + +// sleep on the lock request until it becomes resolved or the wait time has +// elapsed. 
+int lock_request::wait(uint64_t wait_time_ms) { + return wait(wait_time_ms, 0, nullptr); +} + +int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, + int (*killed_callback)(void), + void (*lock_wait_callback)(void *, lock_wait_infos *), + void *callback_arg) { + uint64_t t_now = toku_current_time_microsec(); + uint64_t t_start = t_now; + uint64_t t_end = t_start + wait_time_ms * 1000; + + toku_external_mutex_lock(&m_info->mutex); + + // check again, this time locking out other retry calls + if (m_state == state::PENDING) { + lock_wait_infos conflicts_collector; + retry(&conflicts_collector); + if (m_state == state::PENDING) { + report_waits(&conflicts_collector, lock_wait_callback, callback_arg); + } + } + + while (m_state == state::PENDING) { + // check if this thread is killed + if (killed_callback && killed_callback()) { + remove_from_lock_requests(); + complete(DB_LOCK_NOTGRANTED); + continue; + } + + // compute the time until we should wait + uint64_t t_wait; + if (killed_time_ms == 0) { + t_wait = t_end; + } else { + t_wait = t_now + killed_time_ms * 1000; + if (t_wait > t_end) t_wait = t_end; + } + + int r = toku_external_cond_timedwait(&m_wait_cond, &m_info->mutex, + (int64_t)(t_wait - t_now)); + invariant(r == 0 || r == ETIMEDOUT); + + t_now = toku_current_time_microsec(); + if (m_state == state::PENDING && (t_now >= t_end)) { + m_info->counters.timeout_count += 1; + + // if we're still pending and we timed out, then remove our + // request from the set of lock requests and fail. + remove_from_lock_requests(); + + // complete sets m_state to COMPLETE, breaking us out of the loop + complete(DB_LOCK_NOTGRANTED); + } + } + + uint64_t t_real_end = toku_current_time_microsec(); + uint64_t duration = t_real_end - t_start; + m_info->counters.wait_count += 1; + m_info->counters.wait_time += duration; + if (duration >= 1000000) { + m_info->counters.long_wait_count += 1; + m_info->counters.long_wait_time += duration; + } + toku_external_mutex_unlock(&m_info->mutex); + + invariant(m_state == state::COMPLETE); + return m_complete_r; +} + +// complete this lock request with the given return value +void lock_request::complete(int complete_r) { + m_complete_r = complete_r; + m_state = state::COMPLETE; +} + +const DBT *lock_request::get_left_key(void) const { return m_left_key; } + +const DBT *lock_request::get_right_key(void) const { return m_right_key; } + +TXNID lock_request::get_txnid(void) const { return m_txnid; } + +uint64_t lock_request::get_start_time(void) const { return m_start_time; } + +TXNID lock_request::get_conflicting_txnid(void) const { + return m_conflicting_txnid; +} + +int lock_request::retry(lock_wait_infos *conflicts_collector) { + invariant(m_state == state::PENDING); + int r; + txnid_set conflicts; + conflicts.create(); + + if (m_type == type::WRITE) { + r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } else { + r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, + m_big_txn); + } + + // if the acquisition succeeded then remove ourselves from the + // set of lock requests, complete, and signal the waiting thread. 
+ if (r == 0) { + remove_from_lock_requests(); + complete(r); + if (m_retry_test_callback) m_retry_test_callback(); // test callback + toku_external_cond_broadcast(&m_wait_cond); + } else { + m_conflicting_txnid = conflicts.get(0); + add_conflicts_to_waits(&conflicts, conflicts_collector); + } + conflicts.destroy(); + + return r; +} + +void lock_request::retry_all_lock_requests( + locktree *lt, void (*lock_wait_callback)(void *, lock_wait_infos *), + void *callback_arg, void (*after_retry_all_test_callback)(void)) { + lt_lock_request_info *info = lt->get_lock_request_info(); + + // if there are no pending lock requests than there is nothing to do + // the unlocked data race on pending_is_empty is OK since lock requests + // are retried after added to the pending set. + if (info->pending_is_empty) return; + + // get my retry generation (post increment of retry_want) + unsigned long long my_retry_want = (info->retry_want += 1); + + toku_mutex_lock(&info->retry_mutex); + + // here is the group retry algorithm. + // get the latest retry_want count and use it as the generation number of + // this retry operation. if this retry generation is > the last retry + // generation, then do the lock retries. otherwise, no lock retries + // are needed. + if ((my_retry_want - 1) == info->retry_done) { + for (;;) { + if (!info->running_retry) { + info->running_retry = true; + info->retry_done = info->retry_want; + toku_mutex_unlock(&info->retry_mutex); + retry_all_lock_requests_info(info, lock_wait_callback, callback_arg); + if (after_retry_all_test_callback) after_retry_all_test_callback(); + toku_mutex_lock(&info->retry_mutex); + info->running_retry = false; + toku_cond_broadcast(&info->retry_cv); + break; + } else { + toku_cond_wait(&info->retry_cv, &info->retry_mutex); + } + } + } + toku_mutex_unlock(&info->retry_mutex); +} + +void lock_request::retry_all_lock_requests_info( + lt_lock_request_info *info, + void (*lock_wait_callback)(void *, lock_wait_infos *), void *callback_arg) { + toku_external_mutex_lock(&info->mutex); + // retry all of the pending lock requests. + lock_wait_infos conflicts_collector; + for (uint32_t i = 0; i < info->pending_lock_requests.size();) { + lock_request *request; + int r = info->pending_lock_requests.fetch(i, &request); + invariant_zero(r); + + // retry the lock request. if it didn't succeed, + // move on to the next lock request. otherwise + // the request is gone from the list so we may + // read the i'th entry for the next one. 
+ r = request->retry(&conflicts_collector); + if (r != 0) { + i++; + } + } + + // call report_waits while holding the pending queue lock since + // the waiter object is still valid while it's in the queue + report_waits(&conflicts_collector, lock_wait_callback, callback_arg); + + // future threads should only retry lock requests if some still exist + info->should_retry_lock_requests = info->pending_lock_requests.size() > 0; + toku_external_mutex_unlock(&info->mutex); +} + +void lock_request::add_conflicts_to_waits(txnid_set *conflicts, + lock_wait_infos *wait_conflicts) { + wait_conflicts->push_back({m_lt, get_txnid(), m_extra, {}}); + uint32_t num_conflicts = conflicts->size(); + for (uint32_t i = 0; i < num_conflicts; i++) { + wait_conflicts->back().waitees.push_back(conflicts->get(i)); + } +} + +void lock_request::report_waits(lock_wait_infos *wait_conflicts, + void (*lock_wait_callback)(void *, + lock_wait_infos *), + void *callback_arg) { + if (lock_wait_callback) (*lock_wait_callback)(callback_arg, wait_conflicts); +} + +void *lock_request::get_extra(void) const { return m_extra; } + +void lock_request::kill_waiter(void) { + remove_from_lock_requests(); + complete(DB_LOCK_NOTGRANTED); + toku_external_cond_broadcast(&m_wait_cond); +} + +void lock_request::kill_waiter(locktree *lt, void *extra) { + lt_lock_request_info *info = lt->get_lock_request_info(); + toku_external_mutex_lock(&info->mutex); + for (uint32_t i = 0; i < info->pending_lock_requests.size(); i++) { + lock_request *request; + int r = info->pending_lock_requests.fetch(i, &request); + if (r == 0 && request->get_extra() == extra) { + request->kill_waiter(); + break; + } + } + toku_external_mutex_unlock(&info->mutex); +} + +// find another lock request by txnid. must hold the mutex. +lock_request *lock_request::find_lock_request(const TXNID &txnid) { + lock_request *request; + int r = m_info->pending_lock_requests.find_zero( + txnid, &request, nullptr); + if (r != 0) { + request = nullptr; + } + return request; +} + +// insert this lock request into the locktree's set. must hold the mutex. +void lock_request::insert_into_lock_requests(void) { + uint32_t idx; + lock_request *request; + int r = m_info->pending_lock_requests.find_zero( + m_txnid, &request, &idx); + invariant(r == DB_NOTFOUND); + r = m_info->pending_lock_requests.insert_at(this, idx); + invariant_zero(r); + m_info->pending_is_empty = false; +} + +// remove this lock request from the locktree's set. must hold the mutex. 
+void lock_request::remove_from_lock_requests(void) { + uint32_t idx; + lock_request *request; + int r = m_info->pending_lock_requests.find_zero( + m_txnid, &request, &idx); + invariant_zero(r); + invariant(request == this); + r = m_info->pending_lock_requests.delete_at(idx); + invariant_zero(r); + if (m_info->pending_lock_requests.size() == 0) + m_info->pending_is_empty = true; +} + +int lock_request::find_by_txnid(lock_request *const &request, + const TXNID &txnid) { + TXNID request_txnid = request->m_txnid; + if (request_txnid < txnid) { + return -1; + } else if (request_txnid == txnid) { + return 0; + } else { + return 1; + } +} + +void lock_request::set_start_test_callback(void (*f)(void)) { + m_start_test_callback = f; +} + +void lock_request::set_start_before_pending_test_callback(void (*f)(void)) { + m_start_before_pending_test_callback = f; +} + +void lock_request::set_retry_test_callback(void (*f)(void)) { + m_retry_test_callback = f; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h new file mode 100644 index 000000000..d30e1e2ca --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h @@ -0,0 +1,255 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
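As above, a purely illustrative sketch (not part of the patch) of the lock_request lifecycle implemented in lock_request.cc: create(), set(), start(), an optional wait() if the lock was not immediately granted, then destroy(). The locktree pointer and mutex factory are assumed to be supplied by the caller; the entry points and return codes are the ones defined above.

#include "lock_request.h"

// Sketch: try to write-lock [left, right] for txnid, waiting up to
// wait_time_ms if another transaction holds a conflicting lock.
static int sketch_range_write_lock(toku::locktree *lt, TXNID txnid,
                                   const DBT *left, const DBT *right,
                                   toku_external_mutex_factory_t mutex_factory,
                                   uint64_t wait_time_ms) {
  toku::lock_request request;
  request.create(mutex_factory);
  request.set(lt, txnid, left, right, toku::lock_request::WRITE,
              /*big_txn*/ false);

  int r = request.start();           // 0, DB_LOCK_NOTGRANTED, or DB_LOCK_DEADLOCK
  if (r == DB_LOCK_NOTGRANTED) {
    r = request.wait(wait_time_ms);  // 0 if granted, DB_LOCK_NOTGRANTED on timeout
  }

  request.destroy();                 // valid: the request is no longer PENDING
  return r;
}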
+ +#pragma once + +#include "../db.h" +#include "../ft/comparator.h" +#include "../portability/toku_pthread.h" +#include "locktree.h" +#include "txnid_set.h" +#include "wfg.h" + +namespace toku { + +// Information about a lock wait +struct lock_wait_info { + locktree *ltree; // the tree where wait happens + TXNID waiter; // the waiting transaction + void *m_extra; // lock_request's m_extra + + // The transactions that are waited for. + std::vector waitees; +}; + +typedef std::vector lock_wait_infos; + +// A lock request contains the db, the key range, the lock type, and +// the transaction id that describes a potential row range lock. +// +// the typical use case is: +// - initialize a lock request +// - start to try to acquire the lock +// - do something else +// - wait for the lock request to be resolved on a timed condition +// - destroy the lock request +// a lock request is resolved when its state is no longer pending, or +// when it becomes granted, or timedout, or deadlocked. when resolved, the +// state of the lock request is changed and any waiting threads are awakened. + +class lock_request { + public: + enum type { UNKNOWN, READ, WRITE }; + + // effect: Initializes a lock request. + void create(toku_external_mutex_factory_t mutex_factory); + + // effect: Destroys a lock request. + void destroy(void); + + // effect: Resets the lock request parameters, allowing it to be reused. + // requires: Lock request was already created at some point + void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key, + type lock_type, bool big_txn, void *extra = nullptr); + + // effect: Tries to acquire a lock described by this lock request. + // returns: The return code of locktree::acquire_[write,read]_lock() + // or DB_LOCK_DEADLOCK if this request would end up deadlocked. + int start(void); + + // effect: Sleeps until either the request is granted or the wait time + // expires. returns: The return code of locktree::acquire_[write,read]_lock() + // or simply DB_LOCK_NOTGRANTED if the wait time expired. + int wait(uint64_t wait_time_ms); + int wait(uint64_t wait_time_ms, uint64_t killed_time_ms, + int (*killed_callback)(void), + void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr, + void *callback_arg = nullptr); + + // return: left end-point of the lock range + const DBT *get_left_key(void) const; + + // return: right end-point of the lock range + const DBT *get_right_key(void) const; + + // return: the txnid waiting for a lock + TXNID get_txnid(void) const; + + // return: when this lock request started, as milliseconds from epoch + uint64_t get_start_time(void) const; + + // return: which txnid is blocking this request (there may be more, though) + TXNID get_conflicting_txnid(void) const; + + // effect: Retries all of the lock requests for the given locktree. + // Any lock requests successfully restarted is completed and woken + // up. + // The rest remain pending. 
+ static void retry_all_lock_requests( + locktree *lt, + void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr, + void *callback_arg = nullptr, + void (*after_retry_test_callback)(void) = nullptr); + static void retry_all_lock_requests_info( + lt_lock_request_info *info, + void (*lock_wait_callback)(void *, lock_wait_infos *), + void *callback_arg); + + void set_start_test_callback(void (*f)(void)); + void set_start_before_pending_test_callback(void (*f)(void)); + void set_retry_test_callback(void (*f)(void)); + + void *get_extra(void) const; + + void kill_waiter(void); + static void kill_waiter(locktree *lt, void *extra); + + private: + enum state { + UNINITIALIZED, + INITIALIZED, + PENDING, + COMPLETE, + DESTROYED, + }; + + // The keys for a lock request are stored "unowned" in m_left_key + // and m_right_key. When the request is about to go to sleep, it + // copies these keys and stores them in m_left_key_copy etc and + // sets the temporary pointers to null. + TXNID m_txnid; + TXNID m_conflicting_txnid; + uint64_t m_start_time; + const DBT *m_left_key; + const DBT *m_right_key; + DBT m_left_key_copy; + DBT m_right_key_copy; + + // The lock request type and associated locktree + type m_type; + locktree *m_lt; + + // If the lock request is in the completed state, then its + // final return value is stored in m_complete_r + int m_complete_r; + state m_state; + + toku_external_cond_t m_wait_cond; + + bool m_big_txn; + + // the lock request info state stored in the + // locktree that this lock request is for. + struct lt_lock_request_info *m_info; + + void *m_extra; + + // effect: tries again to acquire the lock described by this lock request + // returns: 0 if retrying the request succeeded and is now complete + int retry(lock_wait_infos *collector); + + void complete(int complete_r); + + // effect: Finds another lock request by txnid. + // requires: The lock request info mutex is held + lock_request *find_lock_request(const TXNID &txnid); + + // effect: Insert this lock request into the locktree's set. + // requires: the locktree's mutex is held + void insert_into_lock_requests(void); + + // effect: Removes this lock request from the locktree's set. + // requires: The lock request info mutex is held + void remove_from_lock_requests(void); + + // effect: Asks this request's locktree which txnids are preventing + // us from getting the lock described by this request. + // returns: conflicts is populated with the txnid's that this request + // is blocked on + void get_conflicts(txnid_set *conflicts); + + // effect: Builds a wait-for-graph for this lock request and the given + // conflict set + void build_wait_graph(wfg *wait_graph, const txnid_set &conflicts); + + // returns: True if this lock request is in deadlock with the given conflicts + // set + bool deadlock_exists(const txnid_set &conflicts); + + void copy_keys(void); + + static int find_by_txnid(lock_request *const &request, const TXNID &txnid); + + // Report list of conflicts to lock wait callback. 
+ static void report_waits(lock_wait_infos *wait_conflicts, + void (*lock_wait_callback)(void *, + lock_wait_infos *), + void *callback_arg); + void add_conflicts_to_waits(txnid_set *conflicts, + lock_wait_infos *wait_conflicts); + + void (*m_start_test_callback)(void); + void (*m_start_before_pending_test_callback)(void); + void (*m_retry_test_callback)(void); + + public: + std::function m_deadlock_cb; + + friend class lock_request_unit_test; +}; +// PORT: lock_request is not a POD anymore due to use of toku_external_cond_t +// This is ok as the PODness is not really required: lock_request objects are +// not moved in memory or anything. +// ENSURE_POD(lock_request); + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc new file mode 100644 index 000000000..3d6a590c7 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc @@ -0,0 +1,1023 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "locktree.h" + +#include + +#include "../portability/toku_pthread.h" +#include "../portability/toku_time.h" +#include "../util/growable_array.h" +#include "range_buffer.h" + +// including the concurrent_tree here expands the templates +// and "defines" the implementation, so we do it here in +// the locktree source file instead of the header. 
+#include "concurrent_tree.h" + +namespace toku { +// A locktree represents the set of row locks owned by all transactions +// over an open dictionary. Read and write ranges are represented as +// a left and right key which are compared with the given descriptor +// and comparison fn. +// +// Each locktree has a reference count which it manages +// but does nothing based on the value of the reference count - it is +// up to the user of the locktree to destroy it when it sees fit. + +void locktree::create(locktree_manager *mgr, DICTIONARY_ID dict_id, + const comparator &cmp, + toku_external_mutex_factory_t mutex_factory) { + m_mgr = mgr; + m_dict_id = dict_id; + + m_cmp.create_from(cmp); + m_reference_count = 1; + m_userdata = nullptr; + + XCALLOC(m_rangetree); + m_rangetree->create(&m_cmp); + + m_sto_txnid = TXNID_NONE; + m_sto_buffer.create(); + m_sto_score = STO_SCORE_THRESHOLD; + m_sto_end_early_count = 0; + m_sto_end_early_time = 0; + + m_escalation_barrier = [](const DBT *, const DBT *, void *) -> bool { + return false; + }; + + m_lock_request_info.init(mutex_factory); +} + +void locktree::set_escalation_barrier_func( + lt_escalation_barrier_check_func func, void *extra) { + m_escalation_barrier = func; + m_escalation_barrier_arg = extra; +} + +void lt_lock_request_info::init(toku_external_mutex_factory_t mutex_factory) { + pending_lock_requests.create(); + pending_is_empty = true; + toku_external_mutex_init(mutex_factory, &mutex); + retry_want = retry_done = 0; + ZERO_STRUCT(counters); + ZERO_STRUCT(retry_mutex); + toku_mutex_init(locktree_request_info_retry_mutex_key, &retry_mutex, nullptr); + toku_cond_init(locktree_request_info_retry_cv_key, &retry_cv, nullptr); + running_retry = false; + + TOKU_VALGRIND_HG_DISABLE_CHECKING(&pending_is_empty, + sizeof(pending_is_empty)); + TOKU_DRD_IGNORE_VAR(pending_is_empty); +} + +void locktree::destroy(void) { + invariant(m_reference_count == 0); + invariant(m_lock_request_info.pending_lock_requests.size() == 0); + m_cmp.destroy(); + m_rangetree->destroy(); + toku_free(m_rangetree); + m_sto_buffer.destroy(); + m_lock_request_info.destroy(); +} + +void lt_lock_request_info::destroy(void) { + pending_lock_requests.destroy(); + toku_external_mutex_destroy(&mutex); + toku_mutex_destroy(&retry_mutex); + toku_cond_destroy(&retry_cv); +} + +void locktree::add_reference(void) { + (void)toku_sync_add_and_fetch(&m_reference_count, 1); +} + +uint32_t locktree::release_reference(void) { + return toku_sync_sub_and_fetch(&m_reference_count, 1); +} + +uint32_t locktree::get_reference_count(void) { return m_reference_count; } + +// a container for a range/txnid pair +struct row_lock { + keyrange range; + TXNID txnid; + bool is_shared; + TxnidVector *owners; +}; + +// iterate over a locked keyrange and copy out all of the data, +// storing each row lock into the given growable array. the +// caller does not own the range inside the returned row locks, +// so remove from the tree with care using them as keys. 
+static void iterate_and_get_overlapping_row_locks(
+    const concurrent_tree::locked_keyrange *lkr,
+    GrowableArray<row_lock> *row_locks) {
+  struct copy_fn_obj {
+    GrowableArray<row_lock> *row_locks;
+    bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+            TxnidVector *owners) {
+      row_lock lock = {.range = range,
+                       .txnid = txnid,
+                       .is_shared = is_shared,
+                       .owners = owners};
+      row_locks->push(lock);
+      return true;
+    }
+  } copy_fn;
+  copy_fn.row_locks = row_locks;
+  lkr->iterate(&copy_fn);
+}
+
+// given a txnid and a set of overlapping row locks, determine
+// which txnids are conflicting, and store them in the conflicts
+// set, if given.
+static bool determine_conflicting_txnids(
+    const GrowableArray<row_lock> &row_locks, const TXNID &txnid,
+    txnid_set *conflicts) {
+  bool conflicts_exist = false;
+  const size_t num_overlaps = row_locks.get_size();
+  for (size_t i = 0; i < num_overlaps; i++) {
+    const row_lock lock = row_locks.fetch_unchecked(i);
+    const TXNID other_txnid = lock.txnid;
+    if (other_txnid != txnid) {
+      if (conflicts) {
+        if (other_txnid == TXNID_SHARED) {
+          // Add all shared lock owners, except this transaction.
+          for (TXNID shared_id : *lock.owners) {
+            if (shared_id != txnid) conflicts->add(shared_id);
+          }
+        } else {
+          conflicts->add(other_txnid);
+        }
+      }
+      conflicts_exist = true;
+    }
+  }
+  return conflicts_exist;
+}
+
+// how much memory does a row lock take up in a concurrent tree?
+static uint64_t row_lock_size_in_tree(const row_lock &lock) {
+  const uint64_t overhead = concurrent_tree::get_insertion_memory_overhead();
+  return lock.range.get_memory_size() + overhead;
+}
+
+// remove and destroy the given row lock from the locked keyrange,
+// then notify the memory tracker of the newly freed lock.
+static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr,
+                                      const row_lock &lock, TXNID txnid,
+                                      locktree_manager *mgr) {
+  const uint64_t mem_released = row_lock_size_in_tree(lock);
+  lkr->remove(lock.range, txnid);
+  if (mgr != nullptr) {
+    mgr->note_mem_released(mem_released);
+  }
+}
+
+// insert a row lock into the locked keyrange, then notify
+// the memory tracker of this newly acquired lock.
+static void insert_row_lock_into_tree(concurrent_tree::locked_keyrange *lkr, + const row_lock &lock, + locktree_manager *mgr) { + uint64_t mem_used = row_lock_size_in_tree(lock); + lkr->insert(lock.range, lock.txnid, lock.is_shared); + if (mgr != nullptr) { + mgr->note_mem_used(mem_used); + } +} + +void locktree::sto_begin(TXNID txnid) { + invariant(m_sto_txnid == TXNID_NONE); + invariant(m_sto_buffer.is_empty()); + m_sto_txnid = txnid; +} + +void locktree::sto_append(const DBT *left_key, const DBT *right_key, + bool is_write_request) { + uint64_t buffer_mem, delta; + + // psergey: the below two lines do not make any sense + // (and it's the same in upstream TokuDB) + keyrange range; + range.create(left_key, right_key); + + buffer_mem = m_sto_buffer.total_memory_size(); + m_sto_buffer.append(left_key, right_key, is_write_request); + delta = m_sto_buffer.total_memory_size() - buffer_mem; + if (m_mgr != nullptr) { + m_mgr->note_mem_used(delta); + } +} + +void locktree::sto_end(void) { + uint64_t mem_size = m_sto_buffer.total_memory_size(); + if (m_mgr != nullptr) { + m_mgr->note_mem_released(mem_size); + } + m_sto_buffer.destroy(); + m_sto_buffer.create(); + m_sto_txnid = TXNID_NONE; +} + +void locktree::sto_end_early_no_accounting(void *prepared_lkr) { + sto_migrate_buffer_ranges_to_tree(prepared_lkr); + sto_end(); + toku_unsafe_set(m_sto_score, 0); +} + +void locktree::sto_end_early(void *prepared_lkr) { + m_sto_end_early_count++; + + tokutime_t t0 = toku_time_now(); + sto_end_early_no_accounting(prepared_lkr); + tokutime_t t1 = toku_time_now(); + + m_sto_end_early_time += (t1 - t0); +} + +void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) { + // There should be something to migrate, and nothing in the rangetree. + invariant(!m_sto_buffer.is_empty()); + invariant(m_rangetree->is_empty()); + + concurrent_tree sto_rangetree; + concurrent_tree::locked_keyrange sto_lkr; + sto_rangetree.create(&m_cmp); + + // insert all of the ranges from the single txnid buffer into a new rangtree + range_buffer::iterator iter(&m_sto_buffer); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + sto_lkr.prepare(&sto_rangetree); + int r = acquire_lock_consolidated(&sto_lkr, m_sto_txnid, rec.get_left_key(), + rec.get_right_key(), + rec.get_exclusive_flag(), nullptr); + invariant_zero(r); + sto_lkr.release(); + iter.next(); + } + + // Iterate the newly created rangetree and insert each range into the + // locktree's rangetree, on behalf of the old single txnid. + struct migrate_fn_obj { + concurrent_tree::locked_keyrange *dst_lkr; + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + // There can't be multiple owners in STO mode + invariant_zero(owners); + dst_lkr->insert(range, txnid, is_shared); + return true; + } + } migrate_fn; + migrate_fn.dst_lkr = + static_cast(prepared_lkr); + sto_lkr.prepare(&sto_rangetree); + sto_lkr.iterate(&migrate_fn); + sto_lkr.remove_all(); + sto_lkr.release(); + sto_rangetree.destroy(); + invariant(!m_rangetree->is_empty()); +} + +bool locktree::sto_try_acquire(void *prepared_lkr, TXNID txnid, + const DBT *left_key, const DBT *right_key, + bool is_write_request) { + if (m_rangetree->is_empty() && m_sto_buffer.is_empty() && + toku_unsafe_fetch(m_sto_score) >= STO_SCORE_THRESHOLD) { + // We can do the optimization because the rangetree is empty, and + // we know its worth trying because the sto score is big enough. 
+    sto_begin(txnid);
+  } else if (m_sto_txnid != TXNID_NONE) {
+    // We are currently doing the optimization. Check if we need to cancel
+    // it because a new txnid appeared, or if the current single txnid has
+    // taken too many locks already.
+    if (m_sto_txnid != txnid ||
+        m_sto_buffer.get_num_ranges() > STO_BUFFER_MAX_SIZE) {
+      sto_end_early(prepared_lkr);
+    }
+  }
+
+  // At this point the sto txnid is properly set. If it is valid, then
+  // this txnid can append its lock to the sto buffer successfully.
+  if (m_sto_txnid != TXNID_NONE) {
+    invariant(m_sto_txnid == txnid);
+    sto_append(left_key, right_key, is_write_request);
+    return true;
+  } else {
+    invariant(m_sto_buffer.is_empty());
+    return false;
+  }
+}
+
+/*
+  Do the same as iterate_and_get_overlapping_row_locks does, but also check for
+  this:
+    The set of overlapping row locks consists of just one read-only shared
+    lock with the same endpoints as specified (in that case, we can just add
+    ourselves into that list)
+
+  @return true - One compatible shared lock
+          false - Otherwise
+*/
+static bool iterate_and_get_overlapping_row_locks2(
+    const concurrent_tree::locked_keyrange *lkr, const DBT *left_key,
+    const DBT *right_key, comparator *cmp, TXNID,
+    GrowableArray<row_lock> *row_locks) {
+  struct copy_fn_obj {
+    GrowableArray<row_lock> *row_locks;
+    bool first_call = true;
+    bool matching_lock_found = false;
+    const DBT *left_key, *right_key;
+    comparator *cmp;
+
+    bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+            TxnidVector *owners) {
+      if (first_call) {
+        first_call = false;
+        if (is_shared && !(*cmp)(left_key, range.get_left_key()) &&
+            !(*cmp)(right_key, range.get_right_key())) {
+          matching_lock_found = true;
+        }
+      } else {
+        // if we see multiple matching locks, it doesn't matter whether
+        // the first one was matching.
+        matching_lock_found = false;
+      }
+      row_lock lock = {.range = range,
+                       .txnid = txnid,
+                       .is_shared = is_shared,
+                       .owners = owners};
+      row_locks->push(lock);
+      return true;
+    }
+  } copy_fn;
+  copy_fn.row_locks = row_locks;
+  copy_fn.left_key = left_key;
+  copy_fn.right_key = right_key;
+  copy_fn.cmp = cmp;
+  lkr->iterate(&copy_fn);
+  return copy_fn.matching_lock_found;
+}
+
+// try to acquire a lock and consolidate it with existing locks if possible
+// param: lkr, a prepared locked keyrange
+// return: 0 on success, DB_LOCK_NOTGRANTED if conflicting locks exist.
+int locktree::acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
+                                        const DBT *left_key,
+                                        const DBT *right_key,
+                                        bool is_write_request,
+                                        txnid_set *conflicts) {
+  int r = 0;
+  concurrent_tree::locked_keyrange *lkr;
+
+  keyrange requested_range;
+  requested_range.create(left_key, right_key);
+  lkr = static_cast<concurrent_tree::locked_keyrange *>(prepared_lkr);
+  lkr->acquire(requested_range);
+
+  // copy out the set of overlapping row locks.
+  GrowableArray<row_lock> overlapping_row_locks;
+  overlapping_row_locks.init();
+  bool matching_shared_lock_found = false;
+
+  if (is_write_request)
+    iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks);
+  else {
+    matching_shared_lock_found = iterate_and_get_overlapping_row_locks2(
+        lkr, left_key, right_key, &m_cmp, txnid, &overlapping_row_locks);
+    // psergey-todo: what to do now? So, we have figured we have just one
+    // shareable lock. Need to add us into it as an owner but the lock
+    // pointer cannot be kept?
+    // A: use find_node_with_overlapping_child(key_range, nullptr);
+    // then, add ourselves to the owner list.
+    // Don't forget to release the subtree after that.
+ } + + if (matching_shared_lock_found) { + // there is just one non-confliting matching shared lock. + // we are hilding a lock on it (see acquire() call above). + // we need to modify it to indicate there is another locker... + if (lkr->add_shared_owner(requested_range, txnid)) { + // Pretend shared lock uses as much memory. + row_lock new_lock = {.range = requested_range, + .txnid = txnid, + .is_shared = false, + .owners = nullptr}; + uint64_t mem_used = row_lock_size_in_tree(new_lock); + if (m_mgr) { + m_mgr->note_mem_used(mem_used); + } + } + requested_range.destroy(); + overlapping_row_locks.deinit(); + return 0; + } + + size_t num_overlapping_row_locks = overlapping_row_locks.get_size(); + + // if any overlapping row locks conflict with this request, bail out. + + bool conflicts_exist = + determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts); + if (!conflicts_exist) { + // there are no conflicts, so all of the overlaps are for the requesting + // txnid. so, we must consolidate all existing overlapping ranges and the + // requested range into one dominating range. then we insert the dominating + // range. + bool all_shared = !is_write_request; + for (size_t i = 0; i < num_overlapping_row_locks; i++) { + row_lock overlapping_lock = overlapping_row_locks.fetch_unchecked(i); + invariant(overlapping_lock.txnid == txnid); + requested_range.extend(m_cmp, overlapping_lock.range); + remove_row_lock_from_tree(lkr, overlapping_lock, TXNID_ANY, m_mgr); + all_shared = all_shared && overlapping_lock.is_shared; + } + + row_lock new_lock = {.range = requested_range, + .txnid = txnid, + .is_shared = all_shared, + .owners = nullptr}; + insert_row_lock_into_tree(lkr, new_lock, m_mgr); + } else { + r = DB_LOCK_NOTGRANTED; + } + + requested_range.destroy(); + overlapping_row_locks.deinit(); + return r; +} + +// acquire a lock in the given key range, inclusive. if successful, +// return 0. otherwise, populate the conflicts txnid_set with the set of +// transactions that conflict with this request. +int locktree::acquire_lock(bool is_write_request, TXNID txnid, + const DBT *left_key, const DBT *right_key, + txnid_set *conflicts) { + int r = 0; + + // we are only supporting write locks for simplicity + // invariant(is_write_request); + + // acquire and prepare a locked keyrange over the requested range. + // prepare is a serialzation point, so we take the opportunity to + // try the single txnid optimization first. + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + + bool acquired = + sto_try_acquire(&lkr, txnid, left_key, right_key, is_write_request); + if (!acquired) { + r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key, + is_write_request, conflicts); + } + + lkr.release(); + return r; +} + +int locktree::try_acquire_lock(bool is_write_request, TXNID txnid, + const DBT *left_key, const DBT *right_key, + txnid_set *conflicts, bool big_txn) { + // All ranges in the locktree must have left endpoints <= right endpoints. + // Range comparisons rely on this fact, so we make a paranoid invariant here. + paranoid_invariant(m_cmp(left_key, right_key) <= 0); + int r = m_mgr == nullptr ? 
0 : m_mgr->check_current_lock_constraints(big_txn); + if (r == 0) { + r = acquire_lock(is_write_request, txnid, left_key, right_key, conflicts); + } + return r; +} + +// the locktree silently upgrades read locks to write locks for simplicity +int locktree::acquire_read_lock(TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn) { + return try_acquire_lock(false, txnid, left_key, right_key, conflicts, + big_txn); +} + +int locktree::acquire_write_lock(TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn) { + return try_acquire_lock(true, txnid, left_key, right_key, conflicts, big_txn); +} + +// typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right, +// TXNID txnid); +void locktree::dump_locks(void *cdata, dump_callback cb) { + concurrent_tree::locked_keyrange lkr; + keyrange range; + range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity()); + + lkr.prepare(m_rangetree); + lkr.acquire(range); + + TXNID sto_txn; + if ((sto_txn = toku_unsafe_fetch(m_sto_txnid)) != TXNID_NONE) { + // insert all of the ranges from the single txnid buffer into a new rangtree + range_buffer::iterator iter(&m_sto_buffer); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + (*cb)(cdata, rec.get_left_key(), rec.get_right_key(), sto_txn, + !rec.get_exclusive_flag(), nullptr); + iter.next(); + } + } else { + GrowableArray all_locks; + all_locks.init(); + iterate_and_get_overlapping_row_locks(&lkr, &all_locks); + + const size_t n_locks = all_locks.get_size(); + for (size_t i = 0; i < n_locks; i++) { + const row_lock lock = all_locks.fetch_unchecked(i); + (*cb)(cdata, lock.range.get_left_key(), lock.range.get_right_key(), + lock.txnid, lock.is_shared, lock.owners); + } + all_locks.deinit(); + } + lkr.release(); + range.destroy(); +} + +void locktree::get_conflicts(bool is_write_request, TXNID txnid, + const DBT *left_key, const DBT *right_key, + txnid_set *conflicts) { + // because we only support write locks, ignore this bit for now. + (void)is_write_request; + + // preparing and acquire a locked keyrange over the range + keyrange range; + range.create(left_key, right_key); + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + lkr.acquire(range); + + // copy out the set of overlapping row locks and determine the conflicts + GrowableArray overlapping_row_locks; + overlapping_row_locks.init(); + iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks); + + // we don't care if conflicts exist. we just want the conflicts set populated. + (void)determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts); + + lkr.release(); + overlapping_row_locks.deinit(); + range.destroy(); +} + +// Effect: +// For each range in the lock tree that overlaps the given range and has +// the given txnid, remove it. +// Rationale: +// In the common case, there is only the range [left_key, right_key] and +// it is associated with txnid, so this is a single tree delete. +// +// However, consolidation and escalation change the objects in the tree +// without telling the txn anything. In this case, the txn may own a +// large range lock that represents its ownership of many smaller range +// locks. For example, the txn may think it owns point locks on keys 1, +// 2, and 3, but due to escalation, only the object [1,3] exists in the +// tree. +// +// The first call for a small lock will remove the large range lock, and +// the rest of the calls should do nothing. 
After the first release, +// another thread can acquire one of the locks that the txn thinks it +// still owns. That's ok, because the txn doesn't want it anymore (it +// unlocks everything at once), but it may find a lock that it does not +// own. +// +// In our example, the txn unlocks key 1, which actually removes the +// whole lock [1,3]. Now, someone else can lock 2 before our txn gets +// around to unlocking 2, so we should not remove that lock. +void locktree::remove_overlapping_locks_for_txnid(TXNID txnid, + const DBT *left_key, + const DBT *right_key) { + keyrange release_range; + release_range.create(left_key, right_key); + + // acquire and prepare a locked keyrange over the release range + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + lkr.acquire(release_range); + + // copy out the set of overlapping row locks. + GrowableArray overlapping_row_locks; + overlapping_row_locks.init(); + iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks); + size_t num_overlapping_row_locks = overlapping_row_locks.get_size(); + + for (size_t i = 0; i < num_overlapping_row_locks; i++) { + row_lock lock = overlapping_row_locks.fetch_unchecked(i); + // If this isn't our lock, that's ok, just don't remove it. + // See rationale above. + // psergey-todo: for shared locks, just remove ourselves from the + // owners. + if (lock.txnid == txnid || (lock.owners && lock.owners->contains(txnid))) { + remove_row_lock_from_tree(&lkr, lock, txnid, m_mgr); + } + } + + lkr.release(); + overlapping_row_locks.deinit(); + release_range.destroy(); +} + +bool locktree::sto_txnid_is_valid_unsafe(void) const { + return toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE; +} + +int locktree::sto_get_score_unsafe(void) const { + return toku_unsafe_fetch(m_sto_score); +} + +bool locktree::sto_try_release(TXNID txnid) { + bool released = false; + if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) { + // check the bit again with a prepared locked keyrange, + // which protects the optimization bits and rangetree data + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + if (m_sto_txnid != TXNID_NONE) { + // this txnid better be the single txnid on this locktree, + // or else we are in big trouble (meaning the logic is broken) + invariant(m_sto_txnid == txnid); + invariant(m_rangetree->is_empty()); + sto_end(); + released = true; + } + lkr.release(); + } + return released; +} + +// release all of the locks for a txnid whose endpoints are pairs +// in the given range buffer. +void locktree::release_locks(TXNID txnid, const range_buffer *ranges, + bool all_trx_locks_hint) { + // try the single txn optimization. if it worked, then all of the + // locks are already released, otherwise we need to do it here. + bool released; + if (all_trx_locks_hint) { + // This will release all of the locks the transaction is holding + released = sto_try_release(txnid); + } else { + /* + psergey: we are asked to release *Some* of the locks the transaction + is holding. + We could try doing that without leaving the STO mode, but right now, + the easiest way is to exit the STO mode and let the non-STO code path + handle it. 
+ */ + if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) { + // check the bit again with a prepared locked keyrange, + // which protects the optimization bits and rangetree data + concurrent_tree::locked_keyrange lkr; + lkr.prepare(m_rangetree); + if (m_sto_txnid != TXNID_NONE) { + sto_end_early(&lkr); + } + lkr.release(); + } + released = false; + } + if (!released) { + range_buffer::iterator iter(ranges); + range_buffer::iterator::record rec; + while (iter.current(&rec)) { + const DBT *left_key = rec.get_left_key(); + const DBT *right_key = rec.get_right_key(); + // All ranges in the locktree must have left endpoints <= right endpoints. + // Range comparisons rely on this fact, so we make a paranoid invariant + // here. + paranoid_invariant(m_cmp(left_key, right_key) <= 0); + remove_overlapping_locks_for_txnid(txnid, left_key, right_key); + iter.next(); + } + // Increase the sto score slightly. Eventually it will hit + // the threshold and we'll try the optimization again. This + // is how a previously multithreaded system transitions into + // a single threaded system that benefits from the optimization. + if (toku_unsafe_fetch(m_sto_score) < STO_SCORE_THRESHOLD) { + toku_sync_fetch_and_add(&m_sto_score, 1); + } + } +} + +// iterate over a locked keyrange and extract copies of the first N +// row locks, storing each one into the given array of size N, +// then removing each extracted lock from the locked keyrange. +static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr, + locktree_manager *mgr, row_lock *row_locks, + int num_to_extract) { + struct extract_fn_obj { + int num_extracted; + int num_to_extract; + row_lock *row_locks; + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + if (num_extracted < num_to_extract) { + row_lock lock; + lock.range.create_copy(range); + lock.txnid = txnid; + lock.is_shared = is_shared; + // deep-copy the set of owners: + if (owners) + lock.owners = new TxnidVector(*owners); + else + lock.owners = nullptr; + row_locks[num_extracted++] = lock; + return true; + } else { + return false; + } + } + } extract_fn; + + extract_fn.row_locks = row_locks; + extract_fn.num_to_extract = num_to_extract; + extract_fn.num_extracted = 0; + lkr->iterate(&extract_fn); + + // now that the ranges have been copied out, complete + // the extraction by removing the ranges from the tree. + // use remove_row_lock_from_tree() so we properly track the + // amount of memory and number of locks freed. + int num_extracted = extract_fn.num_extracted; + invariant(num_extracted <= num_to_extract); + for (int i = 0; i < num_extracted; i++) { + remove_row_lock_from_tree(lkr, row_locks[i], TXNID_ANY, mgr); + } + + return num_extracted; +} + +// Store each newly escalated lock in a range buffer for appropriate txnid. +// We'll rebuild the locktree by iterating over these ranges, and then we +// can pass back each txnid/buffer pair individually through a callback +// to notify higher layers that locks have changed. +struct txnid_range_buffer { + TXNID txnid; + range_buffer buffer; + + static int find_by_txnid(struct txnid_range_buffer *const &other_buffer, + const TXNID &txnid) { + if (txnid < other_buffer->txnid) { + return -1; + } else if (other_buffer->txnid == txnid) { + return 0; + } else { + return 1; + } + } +}; + +// escalate the locks in the locktree by merging adjacent +// locks that have the same txnid into one larger lock. +// +// if there's only one txnid in the locktree then this +// approach works well. 
if there are many txnids and each +// has locks in a random/alternating order, then this does +// not work so well. +void locktree::escalate(lt_escalate_cb after_escalate_callback, + void *after_escalate_callback_extra) { + omt range_buffers; + range_buffers.create(); + + // prepare and acquire a locked keyrange on the entire locktree + concurrent_tree::locked_keyrange lkr; + keyrange infinite_range = keyrange::get_infinite_range(); + lkr.prepare(m_rangetree); + lkr.acquire(infinite_range); + + // if we're in the single txnid optimization, simply call it off. + // if you have to run escalation, you probably don't care about + // the optimization anyway, and this makes things easier. + if (m_sto_txnid != TXNID_NONE) { + // We are already accounting for this escalation time and + // count, so don't do it for sto_end_early too. + sto_end_early_no_accounting(&lkr); + } + + // extract and remove batches of row locks from the locktree + int num_extracted; + const int num_row_locks_per_batch = 128; + row_lock *XCALLOC_N(num_row_locks_per_batch, extracted_buf); + + // we always remove the "first" n because we are removing n + // each time we do an extraction. so this loops until its empty. + while ((num_extracted = extract_first_n_row_locks( + &lkr, m_mgr, extracted_buf, num_row_locks_per_batch)) > 0) { + int current_index = 0; + while (current_index < num_extracted) { + // every batch of extracted locks is in range-sorted order. search + // through them and merge adjacent locks with the same txnid into + // one dominating lock and save it to a set of escalated locks. + // + // first, find the index of the next row lock that + // - belongs to a different txnid, or + // - belongs to several txnids, or + // - is a shared lock (we could potentially merge those but + // currently we don't), or + // - is across a lock escalation barrier. + int next_txnid_index = current_index + 1; + + while (next_txnid_index < num_extracted && + (extracted_buf[current_index].txnid == + extracted_buf[next_txnid_index].txnid) && + !extracted_buf[next_txnid_index].is_shared && + !extracted_buf[next_txnid_index].owners && + !m_escalation_barrier( + extracted_buf[current_index].range.get_right_key(), + extracted_buf[next_txnid_index].range.get_left_key(), + m_escalation_barrier_arg)) { + next_txnid_index++; + } + + // Create an escalated range for the current txnid that dominates + // each range between the current indext and the next txnid's index. + // const TXNID current_txnid = extracted_buf[current_index].txnid; + const DBT *escalated_left_key = + extracted_buf[current_index].range.get_left_key(); + const DBT *escalated_right_key = + extracted_buf[next_txnid_index - 1].range.get_right_key(); + + // Try to find a range buffer for the current txnid. Create one if it + // doesn't exist. Then, append the new escalated range to the buffer. 
(If
+      // a lock is shared by multiple txnids, append it to each txnid's list)
+      TxnidVector *owners_ptr;
+      TxnidVector singleton_owner;
+      if (extracted_buf[current_index].owners)
+        owners_ptr = extracted_buf[current_index].owners;
+      else {
+        singleton_owner.insert(extracted_buf[current_index].txnid);
+        owners_ptr = &singleton_owner;
+      }
+
+      for (auto cur_txnid : *owners_ptr) {
+        uint32_t idx;
+        struct txnid_range_buffer *existing_range_buffer;
+        int r =
+            range_buffers.find_zero(
+                cur_txnid, &existing_range_buffer, &idx);
+        if (r == DB_NOTFOUND) {
+          struct txnid_range_buffer *XMALLOC(new_range_buffer);
+          new_range_buffer->txnid = cur_txnid;
+          new_range_buffer->buffer.create();
+          new_range_buffer->buffer.append(
+              escalated_left_key, escalated_right_key,
+              !extracted_buf[current_index].is_shared);
+          range_buffers.insert_at(new_range_buffer, idx);
+        } else {
+          invariant_zero(r);
+          invariant(existing_range_buffer->txnid == cur_txnid);
+          existing_range_buffer->buffer.append(
+              escalated_left_key, escalated_right_key,
+              !extracted_buf[current_index].is_shared);
+        }
+      }
+
+      current_index = next_txnid_index;
+    }
+
+    // destroy the ranges copied during the extraction
+    for (int i = 0; i < num_extracted; i++) {
+      delete extracted_buf[i].owners;
+      extracted_buf[i].range.destroy();
+    }
+  }
+  toku_free(extracted_buf);
+
+  // Rebuild the locktree from each range in each range buffer,
+  // then notify higher layers that the txnid's locks have changed.
+  //
+  // (shared locks: if a lock was initially shared between transactions TRX1,
+  // TRX2, etc, we will now try to acquire it acting on behalf of TRX1, of
+  // TRX2, etc. This will succeed and an identical shared lock will be
+  // constructed)
+
+  invariant(m_rangetree->is_empty());
+  const uint32_t num_range_buffers = range_buffers.size();
+  for (uint32_t i = 0; i < num_range_buffers; i++) {
+    struct txnid_range_buffer *current_range_buffer;
+    int r = range_buffers.fetch(i, &current_range_buffer);
+    invariant_zero(r);
+    if (r == EINVAL)  // Shouldn't happen, avoid compiler warning
+      continue;
+
+    const TXNID current_txnid = current_range_buffer->txnid;
+    range_buffer::iterator iter(&current_range_buffer->buffer);
+    range_buffer::iterator::record rec;
+    while (iter.current(&rec)) {
+      keyrange range;
+      range.create(rec.get_left_key(), rec.get_right_key());
+      row_lock lock = {.range = range,
+                       .txnid = current_txnid,
+                       .is_shared = !rec.get_exclusive_flag(),
+                       .owners = nullptr};
+      insert_row_lock_into_tree(&lkr, lock, m_mgr);
+      iter.next();
+    }
+
+    // Notify higher layers that locks have changed for the current txnid
+    if (after_escalate_callback) {
+      after_escalate_callback(current_txnid, this, current_range_buffer->buffer,
+                              after_escalate_callback_extra);
+    }
+    current_range_buffer->buffer.destroy();
+  }
+
+  while (range_buffers.size() > 0) {
+    struct txnid_range_buffer *buffer;
+    int r = range_buffers.fetch(0, &buffer);
+    invariant_zero(r);
+    r = range_buffers.delete_at(0);
+    invariant_zero(r);
+    toku_free(buffer);
+  }
+  range_buffers.destroy();
+
+  lkr.release();
+}
+
+void *locktree::get_userdata(void) const { return m_userdata; }
+
+void locktree::set_userdata(void *userdata) { m_userdata = userdata; }
+
+struct lt_lock_request_info *locktree::get_lock_request_info(void) {
+  return &m_lock_request_info;
+}
+
+void locktree::set_comparator(const comparator &cmp) { m_cmp.inherit(cmp); }
+
+locktree_manager *locktree::get_manager(void) const { return m_mgr; }
+
+int locktree::compare(const locktree *lt) const {
+  if (m_dict_id.dictid < lt->m_dict_id.dictid) {
+
return -1; + } else if (m_dict_id.dictid == lt->m_dict_id.dictid) { + return 0; + } else { + return 1; + } +} + +DICTIONARY_ID locktree::get_dict_id() const { return m_dict_id; } + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h new file mode 100644 index 000000000..f0f4b042d --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h @@ -0,0 +1,580 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include + +#include "../db.h" +#include "../ft/comparator.h" +#include "../portability/toku_external_pthread.h" +#include "../portability/toku_pthread.h" +#include "../portability/toku_time.h" +// PORT #include // just for DICTIONARY_ID.. 
+// PORT: ft-status for LTM_STATUS: +#include "../ft/ft-status.h" + +struct DICTIONARY_ID { + uint64_t dictid; +}; + +#include "../util/omt.h" +#include "range_buffer.h" +#include "txnid_set.h" +#include "wfg.h" + +namespace toku { + +class locktree; +class locktree_manager; +class lock_request; +class concurrent_tree; + +typedef int (*lt_create_cb)(locktree *lt, void *extra); +typedef void (*lt_destroy_cb)(locktree *lt); +typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt, + const range_buffer &buffer, void *extra); + +typedef bool (*lt_escalation_barrier_check_func)(const DBT *a, const DBT *b, + void *extra); + +struct lt_counters { + uint64_t wait_count, wait_time; + uint64_t long_wait_count, long_wait_time; + uint64_t timeout_count; + + void add(const lt_counters &rhs) { + wait_count += rhs.wait_count; + wait_time += rhs.wait_time; + long_wait_count += rhs.long_wait_count; + long_wait_time += rhs.long_wait_time; + timeout_count += rhs.timeout_count; + } +}; + +// Lock request state for some locktree +struct lt_lock_request_info { + omt pending_lock_requests; + std::atomic_bool pending_is_empty; + toku_external_mutex_t mutex; + bool should_retry_lock_requests; + lt_counters counters; + std::atomic_ullong retry_want; + unsigned long long retry_done; + toku_mutex_t retry_mutex; + toku_cond_t retry_cv; + bool running_retry; + + void init(toku_external_mutex_factory_t mutex_factory); + void destroy(void); +}; + +// The locktree manager manages a set of locktrees, one for each open +// dictionary. Locktrees are retrieved from the manager. When they are no +// longer needed, they are be released by the user. +class locktree_manager { + public: + // param: create_cb, called just after a locktree is first created. + // destroy_cb, called just before a locktree is destroyed. + // escalate_cb, called after a locktree is escalated (with extra + // param) + void create(lt_create_cb create_cb, lt_destroy_cb destroy_cb, + lt_escalate_cb escalate_cb, void *extra, + toku_external_mutex_factory_t mutex_factory_arg); + + void destroy(void); + + size_t get_max_lock_memory(void); + + int set_max_lock_memory(size_t max_lock_memory); + + // effect: Get a locktree from the manager. If a locktree exists with the + // given + // dict_id, it is referenced and then returned. If one did not exist, + // it is created. It will use the comparator for comparing keys. The + // on_create callback (passed to locktree_manager::create()) will be + // called with the given extra parameter. + locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp, + void *on_create_extra); + + void reference_lt(locktree *lt); + + // effect: Releases one reference on a locktree. If the reference count + // transitions + // to zero, the on_destroy callback is called before it gets + // destroyed. + void release_lt(locktree *lt); + + void get_status(LTM_STATUS status); + + // effect: calls the iterate function on each pending lock request + // note: holds the manager's mutex + typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id, + TXNID txnid, const DBT *left_key, + const DBT *right_key, + TXNID blocking_txnid, + uint64_t start_time, + void *extra); + int iterate_pending_lock_requests(lock_request_iterate_callback cb, + void *extra); + + // effect: Determines if too many locks or too much memory is being used, + // Runs escalation on the manager if so. 
+ // param: big_txn, if the current transaction is 'big' (has spilled rollback + // logs) returns: 0 if there enough resources to create a new lock, or + // TOKUDB_OUT_OF_LOCKS + // if there are not enough resources and lock escalation failed to + // free up enough resources for a new lock. + int check_current_lock_constraints(bool big_txn); + + bool over_big_threshold(void); + + void note_mem_used(uint64_t mem_used); + + void note_mem_released(uint64_t mem_freed); + + bool out_of_locks(void) const; + + // Escalate all locktrees + void escalate_all_locktrees(void); + + // Escalate a set of locktrees + void escalate_locktrees(locktree **locktrees, int num_locktrees); + + // effect: calls the private function run_escalation(), only ok to + // do for tests. + // rationale: to get better stress test coverage, we want a way to + // deterministicly trigger lock escalation. + void run_escalation_for_test(void); + void run_escalation(void); + + // Add time t to the escalator's wait time statistics + void add_escalator_wait_time(uint64_t t); + + void kill_waiter(void *extra); + + private: + static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024; + + // tracks the current number of locks and lock memory + uint64_t m_max_lock_memory; + uint64_t m_current_lock_memory; + + struct lt_counters m_lt_counters; + + // the create and destroy callbacks for the locktrees + lt_create_cb m_lt_create_callback; + lt_destroy_cb m_lt_destroy_callback; + lt_escalate_cb m_lt_escalate_callback; + void *m_lt_escalate_callback_extra; + + omt m_locktree_map; + + toku_external_mutex_factory_t mutex_factory; + + // the manager's mutex protects the locktree map + toku_mutex_t m_mutex; + + void mutex_lock(void); + + void mutex_unlock(void); + + // Manage the set of open locktrees + locktree *locktree_map_find(const DICTIONARY_ID &dict_id); + void locktree_map_put(locktree *lt); + void locktree_map_remove(locktree *lt); + + static int find_by_dict_id(locktree *const <, const DICTIONARY_ID &dict_id); + + void escalator_init(void); + void escalator_destroy(void); + + // statistics about lock escalation. + toku_mutex_t m_escalation_mutex; + uint64_t m_escalation_count; + tokutime_t m_escalation_time; + uint64_t m_escalation_latest_result; + uint64_t m_wait_escalation_count; + uint64_t m_wait_escalation_time; + uint64_t m_long_wait_escalation_count; + uint64_t m_long_wait_escalation_time; + + // the escalator coordinates escalation on a set of locktrees for a bunch of + // threads + class locktree_escalator { + public: + void create(void); + void destroy(void); + void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra), + void *extra); + + private: + toku_mutex_t m_escalator_mutex; + toku_cond_t m_escalator_done; + bool m_escalator_running; + }; + + locktree_escalator m_escalator; + + friend class manager_unit_test; +}; + +// A locktree represents the set of row locks owned by all transactions +// over an open dictionary. Read and write ranges are represented as +// a left and right key which are compared with the given comparator +// +// Locktrees are not created and destroyed by the user. Instead, they are +// referenced and released using the locktree manager. +// +// A sample workflow looks like this: +// - Create a manager. +// - Get a locktree by dictionaroy id from the manager. +// - Perform read/write lock acquision on the locktree, add references to +// the locktree using the manager, release locks, release references, etc. +// - ... +// - Release the final reference to the locktree. 
It will be destroyed. +// - Destroy the manager. +class locktree { + public: + // effect: Creates a locktree + void create(locktree_manager *mgr, DICTIONARY_ID dict_id, + const comparator &cmp, + toku_external_mutex_factory_t mutex_factory); + + void destroy(void); + + // For thread-safe, external reference counting + void add_reference(void); + + // requires: the reference count is > 0 + // returns: the reference count, after decrementing it by one + uint32_t release_reference(void); + + // returns: the current reference count + uint32_t get_reference_count(void); + + // effect: Attempts to grant a read lock for the range of keys between + // [left_key, right_key]. returns: If the lock cannot be granted, return + // DB_LOCK_NOTGRANTED, and populate the + // given conflicts set with the txnids that hold conflicting locks in + // the range. If the locktree cannot create more locks, return + // TOKUDB_OUT_OF_LOCKS. + // note: Read locks cannot be shared between txnids, as one would expect. + // This is for simplicity since read locks are rare in MySQL. + int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, + txnid_set *conflicts, bool big_txn); + + // effect: Attempts to grant a write lock for the range of keys between + // [left_key, right_key]. returns: If the lock cannot be granted, return + // DB_LOCK_NOTGRANTED, and populate the + // given conflicts set with the txnids that hold conflicting locks in + // the range. If the locktree cannot create more locks, return + // TOKUDB_OUT_OF_LOCKS. + int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, + txnid_set *conflicts, bool big_txn); + + // effect: populate the conflicts set with the txnids that would preventing + // the given txnid from getting a lock on [left_key, right_key] + void get_conflicts(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts); + + // effect: Release all of the lock ranges represented by the range buffer for + // a txnid. + void release_locks(TXNID txnid, const range_buffer *ranges, + bool all_trx_locks_hint = false); + + // effect: Runs escalation on this locktree + void escalate(lt_escalate_cb after_escalate_callback, void *extra); + + // returns: The userdata associated with this locktree, or null if it has not + // been set. + void *get_userdata(void) const; + + void set_userdata(void *userdata); + + locktree_manager *get_manager(void) const; + + void set_comparator(const comparator &cmp); + + // Set the user-provided Lock Escalation Barrier check function and its + // argument + // + // Lock Escalation Barrier limits the scope of Lock Escalation. + // For two keys A and B (such that A < B), + // escalation_barrier_check_func(A, B)==true means that there's a lock + // escalation barrier between A and B, and lock escalation is not allowed to + // bridge the gap between A and B. + // + // This method sets the user-provided barrier check function and its + // parameter. + void set_escalation_barrier_func(lt_escalation_barrier_check_func func, + void *extra); + + int compare(const locktree *lt) const; + + DICTIONARY_ID get_dict_id() const; + + // Private info struct for storing pending lock request state. + // Only to be used by lock requests. We store it here as + // something less opaque than usual to strike a tradeoff between + // abstraction and code complexity. 
It is still fairly abstract + // since the lock_request object is opaque + struct lt_lock_request_info *get_lock_request_info(void); + + typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right, + TXNID txnid, bool is_shared, + TxnidVector *owners); + void dump_locks(void *cdata, dump_callback cb); + + private: + locktree_manager *m_mgr; + DICTIONARY_ID m_dict_id; + uint32_t m_reference_count; + + // Since the memory referenced by this comparator is not owned by the + // locktree, the user must guarantee it will outlive the locktree. + // + // The ydb API accomplishes this by opening an ft_handle in the on_create + // callback, which will keep the underlying FT (and its descriptor) in memory + // for as long as the handle is open. The ft_handle is stored opaquely in the + // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra + comparator m_cmp; + + lt_escalation_barrier_check_func m_escalation_barrier; + void *m_escalation_barrier_arg; + + concurrent_tree *m_rangetree; + + void *m_userdata; + struct lt_lock_request_info m_lock_request_info; + + // psergey-todo: + // Each transaction also keeps a list of ranges it has locked. + // So, when a transaction is running in STO mode, two identical + // lists are kept: the STO lock list and transaction's owned locks + // list. Why can't we do with just one list? + + // The following fields and members prefixed with "sto_" are for + // the single txnid optimization, intended to speed up the case + // when only one transaction is using the locktree. If we know + // the locktree has only one transaction, then acquiring locks + // takes O(1) work and releasing all locks takes O(1) work. + // + // How do we know that the locktree only has a single txnid? + // What do we do if it does? + // + // When a txn with txnid T requests a lock: + // - If the tree is empty, the optimization is possible. Set the single + // txnid to T, and insert the lock range into the buffer. + // - If the tree is not empty, check if the single txnid is T. If so, + // append the lock range to the buffer. Otherwise, migrate all of + // the locks in the buffer into the rangetree on behalf of txnid T, + // and invalid the single txnid. + // + // When a txn with txnid T releases its locks: + // - If the single txnid is valid, it must be for T. Destroy the buffer. + // - If it's not valid, release locks the normal way in the rangetree. + // + // To carry out the optimization we need to record a single txnid + // and a range buffer for each locktree, each protected by the root + // lock of the locktree's rangetree. The root lock for a rangetree + // is grabbed by preparing a locked keyrange on the rangetree. + TXNID m_sto_txnid; + range_buffer m_sto_buffer; + + // The single txnid optimization speeds up the case when only one + // transaction is using the locktree. But it has the potential to + // hurt the case when more than one txnid exists. + // + // There are two things we need to do to make the optimization only + // optimize the case we care about, and not hurt the general case. + // + // Bound the worst-case latency for lock migration when the + // optimization stops working: + // - Idea: Stop the optimization and migrate immediate if we notice + // the single txnid has takes many locks in the range buffer. + // - Implementation: Enforce a max size on the single txnid range buffer. + // - Analysis: Choosing the perfect max value, M, is difficult to do + // without some feedback from the field. 
Intuition tells us that M should + // not be so small that the optimization is worthless, and it should not + // be so big that it's unreasonable to have to wait behind a thread doing + // the work of converting M buffer locks into rangetree locks. + // + // Prevent concurrent-transaction workloads from trying the optimization + // in vain: + // - Idea: Don't even bother trying the optimization if we think the + // system is in a concurrent-transaction state. + // - Implementation: Do something even simpler than detecting whether the + // system is in a concurent-transaction state. Just keep a "score" value + // and some threshold. If at any time the locktree is eligible for the + // optimization, only do it if the score is at this threshold. When you + // actually do the optimization but someone has to migrate locks in the buffer + // (expensive), then reset the score back to zero. Each time a txn + // releases locks, the score is incremented by 1. + // - Analysis: If you let the threshold be "C", then at most 1 / C txns will + // do the optimization in a concurrent-transaction system. Similarly, it + // takes at most C txns to start using the single txnid optimzation, which + // is good when the system transitions from multithreaded to single threaded. + // + // STO_BUFFER_MAX_SIZE: + // + // We choose the max value to be 1 million since most transactions are smaller + // than 1 million and we can create a rangetree of 1 million elements in + // less than a second. So we can be pretty confident that this threshold + // enables the optimization almost always, and prevents super pathological + // latency issues for the first lock taken by a second thread. + // + // STO_SCORE_THRESHOLD: + // + // A simple first guess at a good value for the score threshold is 100. + // By our analysis, we'd end up doing the optimization in vain for + // around 1% of all transactions, which seems reasonable. Further, + // if the system goes single threaded, it ought to be pretty quick + // for 100 transactions to go by, so we won't have to wait long before + // we start doing the single txind optimzation again. + static const int STO_BUFFER_MAX_SIZE = 50 * 1024; + static const int STO_SCORE_THRESHOLD = 100; + int m_sto_score; + + // statistics about time spent ending the STO early + uint64_t m_sto_end_early_count; + tokutime_t m_sto_end_early_time; + + // effect: begins the single txnid optimizaiton, setting m_sto_txnid + // to the given txnid. + // requires: m_sto_txnid is invalid + void sto_begin(TXNID txnid); + + // effect: append a range to the sto buffer + // requires: m_sto_txnid is valid + void sto_append(const DBT *left_key, const DBT *right_key, + bool is_write_request); + + // effect: ends the single txnid optimization, releaseing any memory + // stored in the sto buffer, notifying the tracker, and + // invalidating m_sto_txnid. + // requires: m_sto_txnid is valid + void sto_end(void); + + // params: prepared_lkr is a void * to a prepared locked keyrange. see below. + // effect: ends the single txnid optimization early, migrating buffer locks + // into the rangetree, calling sto_end(), and then setting the + // sto_score back to zero. + // requires: m_sto_txnid is valid + void sto_end_early(void *prepared_lkr); + void sto_end_early_no_accounting(void *prepared_lkr); + + // params: prepared_lkr is a void * to a prepared locked keyrange. 
we can't + // use + // the real type because the compiler won't allow us to forward + // declare concurrent_tree::locked_keyrange without including + // concurrent_tree.h, which we cannot do here because it is a template + // implementation. + // requires: the prepared locked keyrange is for the locktree's rangetree + // requires: m_sto_txnid is valid + // effect: migrates each lock in the single txnid buffer into the locktree's + // rangetree, notifying the memory tracker as necessary. + void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr); + + // effect: If m_sto_txnid is valid, then release the txnid's locks + // by ending the optimization. + // requires: If m_sto_txnid is valid, it is equal to the given txnid + // returns: True if locks were released for this txnid + bool sto_try_release(TXNID txnid); + + // params: prepared_lkr is a void * to a prepared locked keyrange. see above. + // requires: the prepared locked keyrange is for the locktree's rangetree + // effect: If m_sto_txnid is valid and equal to the given txnid, then + // append a range onto the buffer. Otherwise, if m_sto_txnid is valid + // but not equal to this txnid, then migrate the buffer's locks + // into the rangetree and end the optimization, setting the score + // back to zero. + // returns: true if the lock was acquired for this txnid + bool sto_try_acquire(void *prepared_lkr, TXNID txnid, const DBT *left_key, + const DBT *right_key, bool is_write_request); + + // Effect: + // Provides a hook for a helgrind suppression. + // Returns: + // true if m_sto_txnid is not TXNID_NONE + bool sto_txnid_is_valid_unsafe(void) const; + + // Effect: + // Provides a hook for a helgrind suppression. + // Returns: + // m_sto_score + int sto_get_score_unsafe(void) const; + + void remove_overlapping_locks_for_txnid(TXNID txnid, const DBT *left_key, + const DBT *right_key); + + int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, + const DBT *left_key, const DBT *right_key, + bool is_write_request, txnid_set *conflicts); + + int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts); + + int try_acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key, + const DBT *right_key, txnid_set *conflicts, + bool big_txn); + + friend class locktree_unit_test; + friend class manager_unit_test; + friend class lock_request_unit_test; + + // engine status reaches into the locktree to read some stats + friend void locktree_manager::get_status(LTM_STATUS status); +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc new file mode 100644 index 000000000..4186182be --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc @@ -0,0 +1,527 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include +#include + +#include "../portability/toku_pthread.h" +#include "../util/status.h" +#include "lock_request.h" +#include "locktree.h" + +namespace toku { + +void locktree_manager::create(lt_create_cb create_cb, lt_destroy_cb destroy_cb, + lt_escalate_cb escalate_cb, void *escalate_extra, + toku_external_mutex_factory_t mutex_factory_arg) { + mutex_factory = mutex_factory_arg; + m_max_lock_memory = DEFAULT_MAX_LOCK_MEMORY; + m_current_lock_memory = 0; + + m_locktree_map.create(); + m_lt_create_callback = create_cb; + m_lt_destroy_callback = destroy_cb; + m_lt_escalate_callback = escalate_cb; + m_lt_escalate_callback_extra = escalate_extra; + ZERO_STRUCT(m_mutex); + toku_mutex_init(manager_mutex_key, &m_mutex, nullptr); + + ZERO_STRUCT(m_lt_counters); + + escalator_init(); +} + +void locktree_manager::destroy(void) { + escalator_destroy(); + invariant(m_current_lock_memory == 0); + invariant(m_locktree_map.size() == 0); + m_locktree_map.destroy(); + toku_mutex_destroy(&m_mutex); +} + +void locktree_manager::mutex_lock(void) { toku_mutex_lock(&m_mutex); } + +void locktree_manager::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); } + +size_t locktree_manager::get_max_lock_memory(void) { return m_max_lock_memory; } + +int locktree_manager::set_max_lock_memory(size_t max_lock_memory) { + int r = 0; + mutex_lock(); + if (max_lock_memory < m_current_lock_memory) { + r = EDOM; + } else { + m_max_lock_memory = max_lock_memory; + } + mutex_unlock(); + return r; +} + +int locktree_manager::find_by_dict_id(locktree *const <, + const DICTIONARY_ID &dict_id) { + if (lt->get_dict_id().dictid < dict_id.dictid) { + return -1; + } else if (lt->get_dict_id().dictid == dict_id.dictid) { + return 0; + } else { + return 1; + } +} + +locktree *locktree_manager::locktree_map_find(const DICTIONARY_ID &dict_id) { + locktree *lt; + int r = m_locktree_map.find_zero(dict_id, <, + nullptr); + return r 
== 0 ? lt : nullptr; +} + +void locktree_manager::locktree_map_put(locktree *lt) { + int r = m_locktree_map.insert( + lt, lt->get_dict_id(), nullptr); + invariant_zero(r); +} + +void locktree_manager::locktree_map_remove(locktree *lt) { + uint32_t idx; + locktree *found_lt; + int r = m_locktree_map.find_zero( + lt->get_dict_id(), &found_lt, &idx); + invariant_zero(r); + invariant(found_lt == lt); + r = m_locktree_map.delete_at(idx); + invariant_zero(r); +} + +locktree *locktree_manager::get_lt(DICTIONARY_ID dict_id, const comparator &cmp, + void *on_create_extra) { + // hold the mutex around searching and maybe + // inserting into the locktree map + mutex_lock(); + + locktree *lt = locktree_map_find(dict_id); + if (lt == nullptr) { + XCALLOC(lt); + lt->create(this, dict_id, cmp, mutex_factory); + + // new locktree created - call the on_create callback + // and put it in the locktree map + if (m_lt_create_callback) { + int r = m_lt_create_callback(lt, on_create_extra); + if (r != 0) { + lt->release_reference(); + lt->destroy(); + toku_free(lt); + lt = nullptr; + } + } + if (lt) { + locktree_map_put(lt); + } + } else { + reference_lt(lt); + } + + mutex_unlock(); + + return lt; +} + +void locktree_manager::reference_lt(locktree *lt) { + // increment using a sync fetch and add. + // the caller guarantees that the lt won't be + // destroyed while we increment the count here. + // + // the caller can do this by already having an lt + // reference or by holding the manager mutex. + // + // if the manager's mutex is held, it is ok for the + // reference count to transition from 0 to 1 (no race), + // since we're serialized with other opens and closes. + lt->add_reference(); +} + +void locktree_manager::release_lt(locktree *lt) { + bool do_destroy = false; + DICTIONARY_ID dict_id = lt->get_dict_id(); + + // Release a reference on the locktree. If the count transitions to zero, + // then we *may* need to do the cleanup. + // + // Grab the manager's mutex and look for a locktree with this locktree's + // dictionary id. Since dictionary id's never get reused, any locktree + // found must be the one we just released a reference on. + // + // At least two things could have happened since we got the mutex: + // - Another thread gets a locktree with the same dict_id, increments + // the reference count. In this case, we shouldn't destroy it. + // - Another thread gets a locktree with the same dict_id and then + // releases it quickly, transitioning the reference count from zero to + // one and back to zero. In this case, only one of us should destroy it. + // It doesn't matter which. We originally missed this case, see #5776. + // + // After 5776, the high level rule for release is described below. + // + // If a thread releases a locktree and notices the reference count transition + // to zero, then that thread must immediately: + // - assume the locktree object is invalid + // - grab the manager's mutex + // - search the locktree map for a locktree with the same dict_id and remove + // it, if it exists. the destroy may be deferred. + // - release the manager's mutex + // + // This way, if many threads transition the same locktree's reference count + // from 1 to zero and wait behind the manager's mutex, only one of them will + // do the actual destroy and the others will happily do nothing. + uint32_t refs = lt->release_reference(); + if (refs == 0) { + mutex_lock(); + // lt may not have already been destroyed, so look it up. 
+ locktree *find_lt = locktree_map_find(dict_id); + if (find_lt != nullptr) { + // A locktree is still in the map with that dict_id, so it must be + // equal to lt. This is true because dictionary ids are never reused. + // If the reference count is zero, it's our responsibility to remove + // it and do the destroy. Otherwise, someone still wants it. + // If the locktree is still valid then check if it should be deleted. + if (find_lt == lt) { + if (lt->get_reference_count() == 0) { + locktree_map_remove(lt); + do_destroy = true; + } + m_lt_counters.add(lt->get_lock_request_info()->counters); + } + } + mutex_unlock(); + } + + // if necessary, do the destroy without holding the mutex + if (do_destroy) { + if (m_lt_destroy_callback) { + m_lt_destroy_callback(lt); + } + lt->destroy(); + toku_free(lt); + } +} + +void locktree_manager::run_escalation(void) { + struct escalation_fn { + static void run(void *extra) { + locktree_manager *mgr = (locktree_manager *)extra; + mgr->escalate_all_locktrees(); + }; + }; + m_escalator.run(this, escalation_fn::run, this); +} + +// test-only version of lock escalation +void locktree_manager::run_escalation_for_test(void) { run_escalation(); } + +void locktree_manager::escalate_all_locktrees(void) { + uint64_t t0 = toku_current_time_microsec(); + + // get all locktrees + mutex_lock(); + int num_locktrees = m_locktree_map.size(); + locktree **locktrees = new locktree *[num_locktrees]; + for (int i = 0; i < num_locktrees; i++) { + int r = m_locktree_map.fetch(i, &locktrees[i]); + invariant_zero(r); + reference_lt(locktrees[i]); + } + mutex_unlock(); + + // escalate them + escalate_locktrees(locktrees, num_locktrees); + + delete[] locktrees; + + uint64_t t1 = toku_current_time_microsec(); + add_escalator_wait_time(t1 - t0); +} + +void locktree_manager::note_mem_used(uint64_t mem_used) { + (void)toku_sync_fetch_and_add(&m_current_lock_memory, mem_used); +} + +void locktree_manager::note_mem_released(uint64_t mem_released) { + uint64_t old_mem_used = + toku_sync_fetch_and_sub(&m_current_lock_memory, mem_released); + invariant(old_mem_used >= mem_released); +} + +bool locktree_manager::out_of_locks(void) const { + return m_current_lock_memory >= m_max_lock_memory; +} + +bool locktree_manager::over_big_threshold(void) { + return m_current_lock_memory >= m_max_lock_memory / 2; +} + +int locktree_manager::iterate_pending_lock_requests( + lock_request_iterate_callback callback, void *extra) { + mutex_lock(); + int r = 0; + uint32_t num_locktrees = m_locktree_map.size(); + for (uint32_t i = 0; i < num_locktrees && r == 0; i++) { + locktree *lt; + r = m_locktree_map.fetch(i, <); + invariant_zero(r); + if (r == EINVAL) // Shouldn't happen, avoid compiler warning + continue; + + struct lt_lock_request_info *info = lt->get_lock_request_info(); + toku_external_mutex_lock(&info->mutex); + + uint32_t num_requests = info->pending_lock_requests.size(); + for (uint32_t k = 0; k < num_requests && r == 0; k++) { + lock_request *req; + r = info->pending_lock_requests.fetch(k, &req); + invariant_zero(r); + if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */ + continue; + r = callback(lt->get_dict_id(), req->get_txnid(), req->get_left_key(), + req->get_right_key(), req->get_conflicting_txnid(), + req->get_start_time(), extra); + } + + toku_external_mutex_unlock(&info->mutex); + } + mutex_unlock(); + return r; +} + +int locktree_manager::check_current_lock_constraints(bool big_txn) { + int r = 0; + if (big_txn && over_big_threshold()) { + run_escalation(); + if 
(over_big_threshold()) { + r = TOKUDB_OUT_OF_LOCKS; + } + } + if (r == 0 && out_of_locks()) { + run_escalation(); + if (out_of_locks()) { + // return an error if we're still out of locks after escalation. + r = TOKUDB_OUT_OF_LOCKS; + } + } + return r; +} + +void locktree_manager::escalator_init(void) { + ZERO_STRUCT(m_escalation_mutex); + toku_mutex_init(manager_escalation_mutex_key, &m_escalation_mutex, nullptr); + m_escalation_count = 0; + m_escalation_time = 0; + m_wait_escalation_count = 0; + m_wait_escalation_time = 0; + m_long_wait_escalation_count = 0; + m_long_wait_escalation_time = 0; + m_escalation_latest_result = 0; + m_escalator.create(); +} + +void locktree_manager::escalator_destroy(void) { + m_escalator.destroy(); + toku_mutex_destroy(&m_escalation_mutex); +} + +void locktree_manager::add_escalator_wait_time(uint64_t t) { + toku_mutex_lock(&m_escalation_mutex); + m_wait_escalation_count += 1; + m_wait_escalation_time += t; + if (t >= 1000000) { + m_long_wait_escalation_count += 1; + m_long_wait_escalation_time += t; + } + toku_mutex_unlock(&m_escalation_mutex); +} + +void locktree_manager::escalate_locktrees(locktree **locktrees, + int num_locktrees) { + // there are too many row locks in the system and we need to tidy up. + // + // a simple implementation of escalation does not attempt + // to reduce the memory foot print of each txn's range buffer. + // doing so would require some layering hackery (or a callback) + // and more complicated locking. for now, just escalate each + // locktree individually, in-place. + tokutime_t t0 = toku_time_now(); + for (int i = 0; i < num_locktrees; i++) { + locktrees[i]->escalate(m_lt_escalate_callback, + m_lt_escalate_callback_extra); + release_lt(locktrees[i]); + } + tokutime_t t1 = toku_time_now(); + + toku_mutex_lock(&m_escalation_mutex); + m_escalation_count++; + m_escalation_time += (t1 - t0); + m_escalation_latest_result = m_current_lock_memory; + toku_mutex_unlock(&m_escalation_mutex); +} + +struct escalate_args { + locktree_manager *mgr; + locktree **locktrees; + int num_locktrees; +}; + +void locktree_manager::locktree_escalator::create(void) { + ZERO_STRUCT(m_escalator_mutex); + toku_mutex_init(manager_escalator_mutex_key, &m_escalator_mutex, nullptr); + toku_cond_init(manager_m_escalator_done_key, &m_escalator_done, nullptr); + m_escalator_running = false; +} + +void locktree_manager::locktree_escalator::destroy(void) { + toku_cond_destroy(&m_escalator_done); + toku_mutex_destroy(&m_escalator_mutex); +} + +void locktree_manager::locktree_escalator::run( + locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra), + void *extra) { + uint64_t t0 = toku_current_time_microsec(); + toku_mutex_lock(&m_escalator_mutex); + if (!m_escalator_running) { + // run escalation on this thread + m_escalator_running = true; + toku_mutex_unlock(&m_escalator_mutex); + escalate_locktrees_fun(extra); + toku_mutex_lock(&m_escalator_mutex); + m_escalator_running = false; + toku_cond_broadcast(&m_escalator_done); + } else { + toku_cond_wait(&m_escalator_done, &m_escalator_mutex); + } + toku_mutex_unlock(&m_escalator_mutex); + uint64_t t1 = toku_current_time_microsec(); + mgr->add_escalator_wait_time(t1 - t0); +} + +void locktree_manager::get_status(LTM_STATUS statp) { + ltm_status.init(); + LTM_STATUS_VAL(LTM_SIZE_CURRENT) = m_current_lock_memory; + LTM_STATUS_VAL(LTM_SIZE_LIMIT) = m_max_lock_memory; + LTM_STATUS_VAL(LTM_ESCALATION_COUNT) = m_escalation_count; + LTM_STATUS_VAL(LTM_ESCALATION_TIME) = m_escalation_time; + 
LTM_STATUS_VAL(LTM_ESCALATION_LATEST_RESULT) = m_escalation_latest_result; + LTM_STATUS_VAL(LTM_WAIT_ESCALATION_COUNT) = m_wait_escalation_count; + LTM_STATUS_VAL(LTM_WAIT_ESCALATION_TIME) = m_wait_escalation_time; + LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_COUNT) = m_long_wait_escalation_count; + LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_TIME) = m_long_wait_escalation_time; + + uint64_t lock_requests_pending = 0; + uint64_t sto_num_eligible = 0; + uint64_t sto_end_early_count = 0; + tokutime_t sto_end_early_time = 0; + uint32_t num_locktrees = 0; + struct lt_counters lt_counters; + ZERO_STRUCT(lt_counters); // PORT: instead of ={}. + + if (toku_mutex_trylock(&m_mutex) == 0) { + lt_counters = m_lt_counters; + num_locktrees = m_locktree_map.size(); + for (uint32_t i = 0; i < num_locktrees; i++) { + locktree *lt; + int r = m_locktree_map.fetch(i, <); + invariant_zero(r); + if (r == EINVAL) // Shouldn't happen, avoid compiler warning + continue; + if (toku_external_mutex_trylock(<->m_lock_request_info.mutex) == 0) { + lock_requests_pending += + lt->m_lock_request_info.pending_lock_requests.size(); + lt_counters.add(lt->get_lock_request_info()->counters); + toku_external_mutex_unlock(<->m_lock_request_info.mutex); + } + sto_num_eligible += lt->sto_txnid_is_valid_unsafe() ? 1 : 0; + sto_end_early_count += lt->m_sto_end_early_count; + sto_end_early_time += lt->m_sto_end_early_time; + } + mutex_unlock(); + } + + LTM_STATUS_VAL(LTM_NUM_LOCKTREES) = num_locktrees; + LTM_STATUS_VAL(LTM_LOCK_REQUESTS_PENDING) = lock_requests_pending; + LTM_STATUS_VAL(LTM_STO_NUM_ELIGIBLE) = sto_num_eligible; + LTM_STATUS_VAL(LTM_STO_END_EARLY_COUNT) = sto_end_early_count; + LTM_STATUS_VAL(LTM_STO_END_EARLY_TIME) = sto_end_early_time; + LTM_STATUS_VAL(LTM_WAIT_COUNT) = lt_counters.wait_count; + LTM_STATUS_VAL(LTM_WAIT_TIME) = lt_counters.wait_time; + LTM_STATUS_VAL(LTM_LONG_WAIT_COUNT) = lt_counters.long_wait_count; + LTM_STATUS_VAL(LTM_LONG_WAIT_TIME) = lt_counters.long_wait_time; + LTM_STATUS_VAL(LTM_TIMEOUT_COUNT) = lt_counters.timeout_count; + *statp = ltm_status; +} + +void locktree_manager::kill_waiter(void *extra) { + mutex_lock(); + int r = 0; + uint32_t num_locktrees = m_locktree_map.size(); + for (uint32_t i = 0; i < num_locktrees; i++) { + locktree *lt; + r = m_locktree_map.fetch(i, <); + invariant_zero(r); + if (r) continue; // Get rid of "may be used uninitialized" warning + lock_request::kill_waiter(lt, extra); + } + mutex_unlock(); +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc new file mode 100644 index 000000000..1e1d23ef8 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc @@ -0,0 +1,265 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "range_buffer.h" + +#include + +#include "../portability/memory.h" +#include "../util/dbt.h" + +namespace toku { + +bool range_buffer::record_header::left_is_infinite(void) const { + return left_neg_inf || left_pos_inf; +} + +bool range_buffer::record_header::right_is_infinite(void) const { + return right_neg_inf || right_pos_inf; +} + +void range_buffer::record_header::init(const DBT *left_key, + const DBT *right_key, + bool is_exclusive) { + is_exclusive_lock = is_exclusive; + left_neg_inf = left_key == toku_dbt_negative_infinity(); + left_pos_inf = left_key == toku_dbt_positive_infinity(); + left_key_size = toku_dbt_is_infinite(left_key) ? 0 : left_key->size; + if (right_key) { + right_neg_inf = right_key == toku_dbt_negative_infinity(); + right_pos_inf = right_key == toku_dbt_positive_infinity(); + right_key_size = toku_dbt_is_infinite(right_key) ? 
0 : right_key->size; + } else { + right_neg_inf = left_neg_inf; + right_pos_inf = left_pos_inf; + right_key_size = 0; + } +} + +const DBT *range_buffer::iterator::record::get_left_key(void) const { + if (_header.left_neg_inf) { + return toku_dbt_negative_infinity(); + } else if (_header.left_pos_inf) { + return toku_dbt_positive_infinity(); + } else { + return &_left_key; + } +} + +const DBT *range_buffer::iterator::record::get_right_key(void) const { + if (_header.right_neg_inf) { + return toku_dbt_negative_infinity(); + } else if (_header.right_pos_inf) { + return toku_dbt_positive_infinity(); + } else { + return &_right_key; + } +} + +size_t range_buffer::iterator::record::size(void) const { + return sizeof(record_header) + _header.left_key_size + _header.right_key_size; +} + +void range_buffer::iterator::record::deserialize(const char *buf) { + size_t current = 0; + + // deserialize the header + memcpy(&_header, buf, sizeof(record_header)); + current += sizeof(record_header); + + // deserialize the left key if necessary + if (!_header.left_is_infinite()) { + // point the left DBT's buffer into ours + toku_fill_dbt(&_left_key, buf + current, _header.left_key_size); + current += _header.left_key_size; + } + + // deserialize the right key if necessary + if (!_header.right_is_infinite()) { + if (_header.right_key_size == 0) { + toku_copyref_dbt(&_right_key, _left_key); + } else { + toku_fill_dbt(&_right_key, buf + current, _header.right_key_size); + } + } +} + +toku::range_buffer::iterator::iterator() + : _ma_chunk_iterator(nullptr), + _current_chunk_base(nullptr), + _current_chunk_offset(0), + _current_chunk_max(0), + _current_rec_size(0) {} + +toku::range_buffer::iterator::iterator(const range_buffer *buffer) + : _ma_chunk_iterator(&buffer->_arena), + _current_chunk_base(nullptr), + _current_chunk_offset(0), + _current_chunk_max(0), + _current_rec_size(0) { + reset_current_chunk(); +} + +void range_buffer::iterator::reset_current_chunk() { + _current_chunk_base = _ma_chunk_iterator.current(&_current_chunk_max); + _current_chunk_offset = 0; +} + +bool range_buffer::iterator::current(record *rec) { + if (_current_chunk_offset < _current_chunk_max) { + const char *buf = reinterpret_cast(_current_chunk_base); + rec->deserialize(buf + _current_chunk_offset); + _current_rec_size = rec->size(); + return true; + } else { + return false; + } +} + +// move the iterator to the next record in the buffer +void range_buffer::iterator::next(void) { + invariant(_current_chunk_offset < _current_chunk_max); + invariant(_current_rec_size > 0); + + // the next record is _current_rec_size bytes forward + _current_chunk_offset += _current_rec_size; + // now, we don't know how big the current is, set it to 0. + _current_rec_size = 0; + + if (_current_chunk_offset >= _current_chunk_max) { + // current chunk is exhausted, try moving to the next one + if (_ma_chunk_iterator.more()) { + _ma_chunk_iterator.next(); + reset_current_chunk(); + } + } +} + +void range_buffer::create(void) { + // allocate buffer space lazily instead of on creation. this way, + // no malloc/free is done if the transaction ends up taking no locks. + _arena.create(0); + _num_ranges = 0; +} + +void range_buffer::append(const DBT *left_key, const DBT *right_key, + bool is_write_request) { + // if the keys are equal, then only one copy is stored. 
+ if (toku_dbt_equals(left_key, right_key)) { + invariant(left_key->size <= MAX_KEY_SIZE); + append_point(left_key, is_write_request); + } else { + invariant(left_key->size <= MAX_KEY_SIZE); + invariant(right_key->size <= MAX_KEY_SIZE); + append_range(left_key, right_key, is_write_request); + } + _num_ranges++; +} + +bool range_buffer::is_empty(void) const { return total_memory_size() == 0; } + +uint64_t range_buffer::total_memory_size(void) const { + return _arena.total_size_in_use(); +} + +int range_buffer::get_num_ranges(void) const { return _num_ranges; } + +void range_buffer::destroy(void) { _arena.destroy(); } + +void range_buffer::append_range(const DBT *left_key, const DBT *right_key, + bool is_exclusive) { + size_t record_length = + sizeof(record_header) + left_key->size + right_key->size; + char *buf = reinterpret_cast(_arena.malloc_from_arena(record_length)); + + record_header h; + h.init(left_key, right_key, is_exclusive); + + // serialize the header + memcpy(buf, &h, sizeof(record_header)); + buf += sizeof(record_header); + + // serialize the left key if necessary + if (!h.left_is_infinite()) { + memcpy(buf, left_key->data, left_key->size); + buf += left_key->size; + } + + // serialize the right key if necessary + if (!h.right_is_infinite()) { + memcpy(buf, right_key->data, right_key->size); + } +} + +void range_buffer::append_point(const DBT *key, bool is_exclusive) { + size_t record_length = sizeof(record_header) + key->size; + char *buf = reinterpret_cast(_arena.malloc_from_arena(record_length)); + + record_header h; + h.init(key, nullptr, is_exclusive); + + // serialize the header + memcpy(buf, &h, sizeof(record_header)); + buf += sizeof(record_header); + + // serialize the key if necessary + if (!h.left_is_infinite()) { + memcpy(buf, key->data, key->size); + } +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h new file mode 100644 index 000000000..76e28d747 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h @@ -0,0 +1,178 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include +#include + +#include "../util/dbt.h" +#include "../util/memarena.h" + +namespace toku { + +// a key range buffer represents a set of key ranges that can +// be stored, iterated over, and then destroyed all at once. +class range_buffer { + private: + // the key range buffer is a bunch of records in a row. + // each record has the following header, followed by the + // left key and right key data payload, if applicable. + // we limit keys to be 2^16, since we store lengths as 2 bytes. + static const size_t MAX_KEY_SIZE = 1 << 16; + + struct record_header { + bool left_neg_inf; + bool left_pos_inf; + bool right_pos_inf; + bool right_neg_inf; + uint16_t left_key_size; + uint16_t right_key_size; + bool is_exclusive_lock; + + bool left_is_infinite(void) const; + + bool right_is_infinite(void) const; + + void init(const DBT *left_key, const DBT *right_key, bool is_exclusive); + }; + // PORT static_assert(sizeof(record_header) == 8, "record header format is + // off"); + + public: + // the iterator abstracts reading over a buffer of variable length + // records one by one until there are no more left. + class iterator { + public: + iterator(); + iterator(const range_buffer *buffer); + + // a record represents the user-view of a serialized key range. + // it handles positive and negative infinity and the optimized + // point range case, where left and right points share memory. + class record { + public: + // get a read-only pointer to the left key of this record's range + const DBT *get_left_key(void) const; + + // get a read-only pointer to the right key of this record's range + const DBT *get_right_key(void) const; + + // how big is this record? this tells us where the next record is + size_t size(void) const; + + bool get_exclusive_flag() const { return _header.is_exclusive_lock; } + + // populate a record header and point our DBT's + // buffers into ours if they are not infinite. + void deserialize(const char *buf); + + private: + record_header _header; + DBT _left_key; + DBT _right_key; + }; + + // populate the given record object with the current + // the memory referred to by record is valid for only + // as long as the record exists. + bool current(record *rec); + + // move the iterator to the next record in the buffer + void next(void); + + private: + void reset_current_chunk(); + + // the key range buffer we are iterating over, the current + // offset in that buffer, and the size of the current record. + memarena::chunk_iterator _ma_chunk_iterator; + const void *_current_chunk_base; + size_t _current_chunk_offset; + size_t _current_chunk_max; + size_t _current_rec_size; + }; + + // allocate buffer space lazily instead of on creation. 
this way, + // no malloc/free is done if the transaction ends up taking no locks. + void create(void); + + // append a left/right key range to the buffer. + // if the keys are equal, then only one copy is stored. + void append(const DBT *left_key, const DBT *right_key, + bool is_write_request = false); + + // is this range buffer empty? + bool is_empty(void) const; + + // how much memory is being used by this range buffer? + uint64_t total_memory_size(void) const; + + // how many ranges are stored in this range buffer? + int get_num_ranges(void) const; + + void destroy(void); + + private: + memarena _arena; + int _num_ranges; + + void append_range(const DBT *left_key, const DBT *right_key, + bool is_write_request); + + // append a point to the buffer. this is the space/time saving + // optimization for key ranges where left == right. + void append_point(const DBT *key, bool is_write_request); +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc new file mode 100644 index 000000000..8997f634b --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc @@ -0,0 +1,520 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
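
A brief usage illustration of the range_buffer interface added earlier in this patch. This is an editorial sketch, not upstream code: it assumes the headers introduced by this patch (range_buffer.h, ../util/dbt.h) are on the include path, and the make_dbt helper and the key strings are purely illustrative. A transaction-local buffer is created lazily, point and two-sided ranges are appended, and the stored records are replayed through the iterator, for example when the transaction's locks are released.

#include <cstdio>
#include <cstring>

#include "range_buffer.h"
#include "../util/dbt.h"  // DBT, toku_fill_dbt

// Illustrative helper (assumed semantics): point a DBT at caller-owned bytes.
static DBT make_dbt(const char *s) {
  DBT d;
  toku_fill_dbt(&d, s, strlen(s) + 1);
  return d;
}

static void replay_example(void) {
  toku::range_buffer buf;
  buf.create();  // no arena allocation until the first append

  DBT a = make_dbt("key-a");
  DBT b = make_dbt("key-b");
  buf.append(&a, &a);                             // point range, stored once
  buf.append(&a, &b, /*is_write_request=*/true);  // two-sided exclusive range

  toku::range_buffer::iterator it(&buf);
  toku::range_buffer::iterator::record rec;
  while (it.current(&rec)) {
    printf("record: %u/%u key bytes, exclusive=%d\n",
           (unsigned)rec.get_left_key()->size,
           (unsigned)rec.get_right_key()->size,
           (int)rec.get_exclusive_flag());
    it.next();
  }
  buf.destroy();
}
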
+ +#include "treenode.h" + +#include "../portability/toku_race_tools.h" + +namespace toku { + +// TODO: source location info might have to be pulled up one caller +// to be useful +void treenode::mutex_lock(void) { toku_mutex_lock(&m_mutex); } + +void treenode::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); } + +void treenode::init(const comparator *cmp) { + m_txnid = TXNID_NONE; + m_is_root = false; + m_is_empty = true; + m_cmp = cmp; + + m_is_shared = false; + m_owners = nullptr; + + // use an adaptive mutex at each node since we expect the time the + // lock is held to be relatively short compared to a context switch. + // indeed, this improves performance at high thread counts considerably. + memset(&m_mutex, 0, sizeof(toku_mutex_t)); + toku_pthread_mutexattr_t attr; + toku_mutexattr_init(&attr); + toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE); + toku_mutex_init(treenode_mutex_key, &m_mutex, &attr); + toku_mutexattr_destroy(&attr); + m_left_child.set(nullptr); + m_right_child.set(nullptr); +} + +void treenode::create_root(const comparator *cmp) { + init(cmp); + m_is_root = true; +} + +void treenode::destroy_root(void) { + invariant(is_root()); + invariant(is_empty()); + toku_mutex_destroy(&m_mutex); + m_cmp = nullptr; +} + +void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid, + bool is_shared) { + // allocates a new copy of the range for this node + m_range.create_copy(range); + m_txnid = txnid; + m_is_shared = is_shared; + m_is_empty = false; +} + +bool treenode::is_root(void) { return m_is_root; } + +bool treenode::is_empty(void) { return m_is_empty; } + +bool treenode::range_overlaps(const keyrange &range) { + return m_range.overlaps(*m_cmp, range); +} + +treenode *treenode::alloc(const comparator *cmp, const keyrange &range, + TXNID txnid, bool is_shared) { + treenode *XCALLOC(node); + node->init(cmp); + node->set_range_and_txnid(range, txnid, is_shared); + return node; +} + +void treenode::swap_in_place(treenode *node1, treenode *node2) { + keyrange tmp_range = node1->m_range; + TXNID tmp_txnid = node1->m_txnid; + node1->m_range = node2->m_range; + node1->m_txnid = node2->m_txnid; + node2->m_range = tmp_range; + node2->m_txnid = tmp_txnid; + + bool tmp_is_shared = node1->m_is_shared; + node1->m_is_shared = node2->m_is_shared; + node2->m_is_shared = tmp_is_shared; + + auto tmp_m_owners = node1->m_owners; + node1->m_owners = node2->m_owners; + node2->m_owners = tmp_m_owners; +} + +bool treenode::add_shared_owner(TXNID txnid) { + assert(m_is_shared); + if (txnid == m_txnid) + return false; // acquiring a lock on the same range by the same trx + + if (m_txnid != TXNID_SHARED) { + m_owners = new TxnidVector; + m_owners->insert(m_txnid); + m_txnid = TXNID_SHARED; + } + m_owners->insert(txnid); + return true; +} + +void treenode::free(treenode *node) { + // destroy the range, freeing any copied keys + node->m_range.destroy(); + + if (node->m_owners) { + delete node->m_owners; + node->m_owners = nullptr; // need this? + } + + // the root is simply marked as empty. + if (node->is_root()) { + // PORT toku_mutex_assert_locked(&node->m_mutex); + node->m_is_empty = true; + } else { + // PORT toku_mutex_assert_unlocked(&node->m_mutex); + toku_mutex_destroy(&node->m_mutex); + toku_free(node); + } +} + +uint32_t treenode::get_depth_estimate(void) const { + const uint32_t left_est = m_left_child.depth_est; + const uint32_t right_est = m_right_child.depth_est; + return (left_est > right_est ? 
left_est : right_est) + 1; +} + +treenode *treenode::find_node_with_overlapping_child( + const keyrange &range, const keyrange::comparison *cmp_hint) { + // determine which child to look at based on a comparison. if we were + // given a comparison hint, use that. otherwise, compare them now. + keyrange::comparison c = + cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range); + + treenode *child; + if (c == keyrange::comparison::LESS_THAN) { + child = lock_and_rebalance_left(); + } else { + // The caller (locked_keyrange::acquire) handles the case where + // the root of the locked_keyrange is the node that overlaps. + // range is guaranteed not to overlap this node. + invariant(c == keyrange::comparison::GREATER_THAN); + child = lock_and_rebalance_right(); + } + + // if the search would lead us to an empty subtree (child == nullptr), + // or the child overlaps, then we know this node is the parent we want. + // otherwise we need to recur into that child. + if (child == nullptr) { + return this; + } else { + c = range.compare(*m_cmp, child->m_range); + if (c == keyrange::comparison::EQUALS || + c == keyrange::comparison::OVERLAPS) { + child->mutex_unlock(); + return this; + } else { + // unlock this node before recurring into the locked child, + // passing in a comparison hint since we just comapred range + // to the child's range. + mutex_unlock(); + return child->find_node_with_overlapping_child(range, &c); + } + } +} + +bool treenode::insert(const keyrange &range, TXNID txnid, bool is_shared) { + int rc = true; + // choose a child to check. if that child is null, then insert the new node + // there. otherwise recur down that child's subtree + keyrange::comparison c = range.compare(*m_cmp, m_range); + if (c == keyrange::comparison::LESS_THAN) { + treenode *left_child = lock_and_rebalance_left(); + if (left_child == nullptr) { + left_child = treenode::alloc(m_cmp, range, txnid, is_shared); + m_left_child.set(left_child); + } else { + left_child->insert(range, txnid, is_shared); + left_child->mutex_unlock(); + } + } else if (c == keyrange::comparison::GREATER_THAN) { + // invariant(c == keyrange::comparison::GREATER_THAN); + treenode *right_child = lock_and_rebalance_right(); + if (right_child == nullptr) { + right_child = treenode::alloc(m_cmp, range, txnid, is_shared); + m_right_child.set(right_child); + } else { + right_child->insert(range, txnid, is_shared); + right_child->mutex_unlock(); + } + } else if (c == keyrange::comparison::EQUALS) { + invariant(is_shared); + invariant(m_is_shared); + rc = add_shared_owner(txnid); + } else { + invariant(0); + } + return rc; +} + +treenode *treenode::find_child_at_extreme(int direction, treenode **parent) { + treenode *child = + direction > 0 ? 
m_right_child.get_locked() : m_left_child.get_locked(); + + if (child) { + *parent = this; + treenode *child_extreme = child->find_child_at_extreme(direction, parent); + child->mutex_unlock(); + return child_extreme; + } else { + return this; + } +} + +treenode *treenode::find_leftmost_child(treenode **parent) { + return find_child_at_extreme(-1, parent); +} + +treenode *treenode::find_rightmost_child(treenode **parent) { + return find_child_at_extreme(1, parent); +} + +treenode *treenode::remove_root_of_subtree() { + // if this node has no children, just free it and return null + if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) { + // treenode::free requires that non-root nodes are unlocked + if (!is_root()) { + mutex_unlock(); + } + treenode::free(this); + return nullptr; + } + + // we have a child, so get either the in-order successor or + // predecessor of this node to be our replacement. + // replacement_parent is updated by the find functions as + // they recur down the tree, so initialize it to this. + treenode *child, *replacement; + treenode *replacement_parent = this; + if (m_left_child.ptr != nullptr) { + child = m_left_child.get_locked(); + replacement = child->find_rightmost_child(&replacement_parent); + invariant(replacement == child || replacement_parent != this); + + // detach the replacement from its parent + if (replacement_parent == this) { + m_left_child = replacement->m_left_child; + } else { + replacement_parent->m_right_child = replacement->m_left_child; + } + } else { + child = m_right_child.get_locked(); + replacement = child->find_leftmost_child(&replacement_parent); + invariant(replacement == child || replacement_parent != this); + + // detach the replacement from its parent + if (replacement_parent == this) { + m_right_child = replacement->m_right_child; + } else { + replacement_parent->m_left_child = replacement->m_right_child; + } + } + child->mutex_unlock(); + + // swap in place with the detached replacement, then destroy it + treenode::swap_in_place(replacement, this); + treenode::free(replacement); + + return this; +} + +void treenode::recursive_remove(void) { + treenode *left = m_left_child.ptr; + if (left) { + left->recursive_remove(); + } + m_left_child.set(nullptr); + + treenode *right = m_right_child.ptr; + if (right) { + right->recursive_remove(); + } + m_right_child.set(nullptr); + + // we do not take locks on the way down, so we know non-root nodes + // are unlocked here and the caller is required to pass a locked + // root, so this free is correct. + treenode::free(this); +} + +void treenode::remove_shared_owner(TXNID txnid) { + assert(m_owners->size() > 1); + m_owners->erase(txnid); + assert(m_owners->size() > 0); + /* if there is just one owner left, move it to m_txnid */ + if (m_owners->size() == 1) { + m_txnid = *m_owners->begin(); + delete m_owners; + m_owners = nullptr; + } +} + +treenode *treenode::remove(const keyrange &range, TXNID txnid) { + treenode *child; + // if the range is equal to this node's range, then just remove + // the root of this subtree. otherwise search down the tree + // in either the left or right children. + keyrange::comparison c = range.compare(*m_cmp, m_range); + switch (c) { + case keyrange::comparison::EQUALS: { + // if we are the only owners, remove. Otherwise, just remove + // us from the owners list. 
+ if (txnid != TXNID_ANY && has_multiple_owners()) { + remove_shared_owner(txnid); + return this; + } else { + return remove_root_of_subtree(); + } + } + case keyrange::comparison::LESS_THAN: + child = m_left_child.get_locked(); + invariant_notnull(child); + child = child->remove(range, txnid); + + // unlock the child if there still is one. + // regardless, set the right child pointer + if (child) { + child->mutex_unlock(); + } + m_left_child.set(child); + break; + case keyrange::comparison::GREATER_THAN: + child = m_right_child.get_locked(); + invariant_notnull(child); + child = child->remove(range, txnid); + + // unlock the child if there still is one. + // regardless, set the right child pointer + if (child) { + child->mutex_unlock(); + } + m_right_child.set(child); + break; + case keyrange::comparison::OVERLAPS: + // shouldn't be overlapping, since the tree is + // non-overlapping and this range must exist + abort(); + } + + return this; +} + +bool treenode::left_imbalanced(int threshold) const { + uint32_t left_depth = m_left_child.depth_est; + uint32_t right_depth = m_right_child.depth_est; + return m_left_child.ptr != nullptr && left_depth > threshold + right_depth; +} + +bool treenode::right_imbalanced(int threshold) const { + uint32_t left_depth = m_left_child.depth_est; + uint32_t right_depth = m_right_child.depth_est; + return m_right_child.ptr != nullptr && right_depth > threshold + left_depth; +} + +// effect: rebalances the subtree rooted at this node +// using AVL style O(1) rotations. unlocks this +// node if it is not the new root of the subtree. +// requires: node is locked by this thread, children are not +// returns: locked root node of the rebalanced tree +treenode *treenode::maybe_rebalance(void) { + // if we end up not rotating at all, the new root is this + treenode *new_root = this; + treenode *child = nullptr; + + if (left_imbalanced(IMBALANCE_THRESHOLD)) { + child = m_left_child.get_locked(); + if (child->right_imbalanced(0)) { + treenode *grandchild = child->m_right_child.get_locked(); + + child->m_right_child = grandchild->m_left_child; + grandchild->m_left_child.set(child); + + m_left_child = grandchild->m_right_child; + grandchild->m_right_child.set(this); + + new_root = grandchild; + } else { + m_left_child = child->m_right_child; + child->m_right_child.set(this); + new_root = child; + } + } else if (right_imbalanced(IMBALANCE_THRESHOLD)) { + child = m_right_child.get_locked(); + if (child->left_imbalanced(0)) { + treenode *grandchild = child->m_left_child.get_locked(); + + child->m_left_child = grandchild->m_right_child; + grandchild->m_right_child.set(child); + + m_right_child = grandchild->m_left_child; + grandchild->m_left_child.set(this); + + new_root = grandchild; + } else { + m_right_child = child->m_left_child; + child->m_left_child.set(this); + new_root = child; + } + } + + // up to three nodes may be locked. + // - this + // - child + // - grandchild (but if it is locked, its the new root) + // + // one of them is the new root. we unlock everything except the new root. 
+ if (child && child != new_root) { + TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex); + child->mutex_unlock(); + } + if (this != new_root) { + TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex); + mutex_unlock(); + } + TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex); + return new_root; +} + +treenode *treenode::lock_and_rebalance_left(void) { + treenode *child = m_left_child.get_locked(); + if (child) { + treenode *new_root = child->maybe_rebalance(); + m_left_child.set(new_root); + child = new_root; + } + return child; +} + +treenode *treenode::lock_and_rebalance_right(void) { + treenode *child = m_right_child.get_locked(); + if (child) { + treenode *new_root = child->maybe_rebalance(); + m_right_child.set(new_root); + child = new_root; + } + return child; +} + +void treenode::child_ptr::set(treenode *node) { + ptr = node; + depth_est = ptr ? ptr->get_depth_estimate() : 0; +} + +treenode *treenode::child_ptr::get_locked(void) { + if (ptr) { + ptr->mutex_lock(); + depth_est = ptr->get_depth_estimate(); + } + return ptr; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h new file mode 100644 index 000000000..ec25a8c58 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h @@ -0,0 +1,302 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
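
Before the treenode interface below, here is a small standalone sketch of the rebalancing step implemented in treenode.cc above. It is editorial and uses hypothetical names: the depth is recomputed recursively rather than cached, and there is no locking, so it only illustrates how one imbalance is healed with a single or double rotation. The real maybe_rebalance performs the same rotations under per-node mutexes and republishes the new subtree root through child_ptr::set, which also refreshes the cached depth estimate.

#include <algorithm>
#include <cstdint>

// Hypothetical node: the real treenode also carries a key range, owner
// txnid(s) and a per-node mutex.
struct node {
  node *left = nullptr;
  node *right = nullptr;
};

static uint32_t depth(const node *n) {
  // recomputed here for clarity; the real code caches an estimate per child
  if (n == nullptr) return 0;
  return std::max(depth(n->left), depth(n->right)) + 1;
}

static const uint32_t kImbalanceThreshold = 2;  // mirrors IMBALANCE_THRESHOLD

// Returns the new root of this subtree after at most one local fix-up.
// Only the left-heavy case is shown; the right-heavy case is symmetric.
static node *maybe_rebalance(node *n) {
  uint32_t l = depth(n->left);
  uint32_t r = depth(n->right);
  if (n->left != nullptr && l > kImbalanceThreshold + r) {
    node *child = n->left;
    if (child->right != nullptr && depth(child->right) > depth(child->left)) {
      // left-right shape: double rotation through the grandchild
      node *grandchild = child->right;
      child->right = grandchild->left;
      grandchild->left = child;
      n->left = grandchild->right;
      grandchild->right = n;
      return grandchild;
    }
    // left-left shape: single rotation
    n->left = child->right;
    child->right = n;
    return child;
  }
  return n;  // balanced enough (or right-heavy): nothing to do here
}
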
+ +#pragma once + +#include + +#include "../ft/comparator.h" +#include "../portability/memory.h" +#include "../portability/toku_pthread.h" +// PORT: we need LTM_STATUS +#include "../ft/ft-status.h" +#include "../portability/txn_subst.h" +#include "keyrange.h" + +namespace toku { + +// a node in a tree with its own mutex +// - range is the "key" of this node +// - txnid is the single txnid associated with this node +// - left and right children may be null +// +// to build a tree on top of this abstraction, the user: +// - provides memory for a root node, initializes it via create_root() +// - performs tree operations on the root node. memory management +// below the root node is handled by the abstraction, not the user. +// this pattern: +// - guaruntees a root node always exists. +// - does not allow for rebalances on the root node + +class treenode { + public: + // every treenode function has some common requirements: + // - node is locked and children are never locked + // - node may be unlocked if no other thread has visibility + + // effect: create the root node + void create_root(const comparator *cmp); + + // effect: destroys the root node + void destroy_root(void); + + // effect: sets the txnid and copies the given range for this node + void set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared); + + // returns: true iff this node is marked as empty + bool is_empty(void); + + // returns: true if this is the root node, denoted by a null parent + bool is_root(void); + + // returns: true if the given range overlaps with this node's range + bool range_overlaps(const keyrange &range); + + // effect: locks the node + void mutex_lock(void); + + // effect: unlocks the node + void mutex_unlock(void); + + // return: node whose child overlaps, or a child that is empty + // and would contain range if it existed + // given: if cmp_hint is non-null, then it is a precomputed + // comparison of this node's range to the given range. + treenode *find_node_with_overlapping_child( + const keyrange &range, const keyrange::comparison *cmp_hint); + + // effect: performs an in-order traversal of the ranges that overlap the + // given range, calling function->fn() on each node that does + // requires: function signature is: bool fn(const keyrange &range, TXNID + // txnid) requires: fn returns true to keep iterating, false to stop iterating + // requires: fn does not attempt to use any ranges read out by value + // after removing a node with an overlapping range from the tree. + template + void traverse_overlaps(const keyrange &range, F *function) { + keyrange::comparison c = range.compare(*m_cmp, m_range); + if (c == keyrange::comparison::EQUALS) { + // Doesn't matter if fn wants to keep going, there + // is nothing left, so return. + function->fn(m_range, m_txnid, m_is_shared, m_owners); + return; + } + + treenode *left = m_left_child.get_locked(); + if (left) { + if (c != keyrange::comparison::GREATER_THAN) { + // Target range is less than this node, or it overlaps this + // node. There may be something on the left. + left->traverse_overlaps(range, function); + } + left->mutex_unlock(); + } + + if (c == keyrange::comparison::OVERLAPS) { + bool keep_going = function->fn(m_range, m_txnid, m_is_shared, m_owners); + if (!keep_going) { + return; + } + } + + treenode *right = m_right_child.get_locked(); + if (right) { + if (c != keyrange::comparison::LESS_THAN) { + // Target range is greater than this node, or it overlaps this + // node. There may be something on the right. 
+ right->traverse_overlaps(range, function); + } + right->mutex_unlock(); + } + } + + // effect: inserts the given range and txnid into a subtree, recursively + // requires: range does not overlap with any node below the subtree + bool insert(const keyrange &range, TXNID txnid, bool is_shared); + + // effect: removes the given range from the subtree + // requires: range exists in the subtree + // returns: the root of the resulting subtree + treenode *remove(const keyrange &range, TXNID txnid); + + // effect: removes this node and all of its children, recursively + // requires: every node at and below this node is unlocked + void recursive_remove(void); + + private: + // the child_ptr is a light abstraction for the locking of + // a child and the maintenence of its depth estimate. + + struct child_ptr { + // set the child pointer + void set(treenode *node); + + // get and lock this child if it exists + treenode *get_locked(void); + + treenode *ptr; + uint32_t depth_est; + }; + + // the balance factor at which a node is considered imbalanced + static const int32_t IMBALANCE_THRESHOLD = 2; + + // node-level mutex + toku_mutex_t m_mutex; + + // the range and txnid for this node. the range contains a copy + // of the keys originally inserted into the tree. nodes may + // swap ranges. but at the end of the day, when a node is + // destroyed, it frees the memory associated with whatever range + // it has at the time of destruction. + keyrange m_range; + + void remove_shared_owner(TXNID txnid); + + bool has_multiple_owners() { return (m_txnid == TXNID_SHARED); } + + private: + // Owner transaction id. + // A value of TXNID_SHARED means this node has multiple owners + TXNID m_txnid; + + // If true, this lock is a non-exclusive lock, and it can have either + // one or several owners. + bool m_is_shared; + + // List of the owners, or nullptr if there's just one owner. + TxnidVector *m_owners; + + // two child pointers + child_ptr m_left_child; + child_ptr m_right_child; + + // comparator for ranges + // psergey-todo: Is there any sense to store the comparator in each tree + // node? + const comparator *m_cmp; + + // marked for the root node. the root node is never free()'d + // when removed, but instead marked as empty. + bool m_is_root; + + // marked for an empty node. only valid for the root. + bool m_is_empty; + + // effect: initializes an empty node with the given comparator + void init(const comparator *cmp); + + // requires: this is a shared node (m_is_shared==true) + // effect: another transaction is added as an owner. + // returns: true <=> added another owner + // false <=> this transaction is already an owner + bool add_shared_owner(TXNID txnid); + + // requires: *parent is initialized to something meaningful. + // requires: subtree is non-empty + // returns: the leftmost child of the given subtree + // returns: a pointer to the parent of said child in *parent, only + // if this function recurred, otherwise it is untouched. + treenode *find_leftmost_child(treenode **parent); + + // requires: *parent is initialized to something meaningful. + // requires: subtree is non-empty + // returns: the rightmost child of the given subtree + // returns: a pointer to the parent of said child in *parent, only + // if this function recurred, otherwise it is untouched. 
+ treenode *find_rightmost_child(treenode **parent); + + // effect: remove the root of this subtree, destroying the old root + // returns: the new root of the subtree + treenode *remove_root_of_subtree(void); + + // requires: subtree is non-empty, direction is not 0 + // returns: the child of the subtree at either the left or rightmost extreme + treenode *find_child_at_extreme(int direction, treenode **parent); + + // effect: retrieves and possibly rebalances the left child + // returns: a locked left child, if it exists + treenode *lock_and_rebalance_left(void); + + // effect: retrieves and possibly rebalances the right child + // returns: a locked right child, if it exists + treenode *lock_and_rebalance_right(void); + + // returns: the estimated depth of this subtree + uint32_t get_depth_estimate(void) const; + + // returns: true iff left subtree depth is sufficiently less than the right + bool left_imbalanced(int threshold) const; + + // returns: true iff right subtree depth is sufficiently greater than the left + bool right_imbalanced(int threshold) const; + + // effect: performs an O(1) rebalance, which will "heal" an imbalance by at + // most 1. effect: if the new root is not this node, then this node is + // unlocked. returns: locked node representing the new root of the rebalanced + // subtree + treenode *maybe_rebalance(void); + + // returns: allocated treenode populated with a copy of the range and txnid + static treenode *alloc(const comparator *cmp, const keyrange &range, + TXNID txnid, bool is_shared); + + // requires: node is a locked root node, or an unlocked non-root node + static void free(treenode *node); + + // effect: swaps the range/txnid pairs for node1 and node2. + static void swap_in_place(treenode *node1, treenode *node2); + + friend class concurrent_tree_unit_test; +}; + +} /* namespace toku */ diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc new file mode 100644 index 000000000..4caf1e26f --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc @@ -0,0 +1,120 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ifndef ROCKSDB_LITE +#ifndef OS_WIN +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +======= */ + +#ident \ + "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "txnid_set.h" + +#include "../db.h" + +namespace toku { + +int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b); +int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b) { + if (txnid_a < txnid_b) { + return -1; + } else if (txnid_a == txnid_b) { + return 0; + } else { + return 1; + } +} + +void txnid_set::create(void) { + // lazily allocate the underlying omt, since it is common + // to create a txnid set and never put anything in it. + m_txnids.create_no_array(); +} + +void txnid_set::destroy(void) { m_txnids.destroy(); } + +// Return true if the given transaction id is a member of the set. +// Otherwise, return false. +bool txnid_set::contains(TXNID txnid) const { + TXNID find_txnid; + int r = m_txnids.find_zero(txnid, &find_txnid, nullptr); + return r == 0 ? true : false; +} + +// Add a given txnid to the set +void txnid_set::add(TXNID txnid) { + int r = m_txnids.insert(txnid, txnid, nullptr); + invariant(r == 0 || r == DB_KEYEXIST); +} + +// Delete a given txnid from the set. +void txnid_set::remove(TXNID txnid) { + uint32_t idx; + int r = m_txnids.find_zero(txnid, nullptr, &idx); + if (r == 0) { + r = m_txnids.delete_at(idx); + invariant_zero(r); + } +} + +// Return the size of the set +uint32_t txnid_set::size(void) const { return m_txnids.size(); } + +// Get the ith id in the set, assuming that the set is sorted. +TXNID txnid_set::get(uint32_t i) const { + TXNID txnid; + int r = m_txnids.fetch(i, &txnid); + if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */ + return TXNID_NONE; + invariant_zero(r); + return txnid; +} + +} /* namespace toku */ +#endif // OS_WIN +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h new file mode 100644 index 000000000..d79c24fb0 --- /dev/null +++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h @@ -0,0 +1,92 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see .
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see .
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../portability/txn_subst.h"
+#include "../util/omt.h"
+
+namespace toku {
+
+class txnid_set {
+ public:
+ // effect: Creates an empty set. Does not malloc space for
+ // any entries yet. That is done lazily on add().
+ void create(void);
+
+ // effect: Destroy the set's internals.
+ void destroy(void);
+
+ // returns: True if the given txnid is a member of the set.
+ bool contains(TXNID id) const;
+
+ // effect: Adds a given txnid to the set if it did not exist
+ void add(TXNID txnid);
+
+ // effect: Deletes a txnid from the set if it exists.
+ void remove(TXNID txnid);
+
+ // returns: Size of the set
+ uint32_t size(void) const;
+
+ // returns: The "i'th" id in the set, as if it were sorted.
+ TXNID get(uint32_t i) const;
+
+ private:
+ toku::omt m_txnids;
+
+ friend class txnid_set_unit_test;
+};
+ENSURE_POD(txnid_set);
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc
new file mode 100644
index 000000000..24536c88e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc
@@ -0,0 +1,213 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see .
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see .
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "../db.h"
+#include "../portability/memory.h"
+// PORT #include
+#include
+#include
+
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+// Create a lock request graph
+void wfg::create(void) { m_nodes.create(); }
+
+// Destroy the internals of the lock request graph
+void wfg::destroy(void) {
+ uint32_t n_nodes = m_nodes.size();
+ for (uint32_t i = 0; i < n_nodes; i++) {
+ node *n;
+ int r = m_nodes.fetch(i, &n);
+ invariant_zero(r);
+ invariant_notnull(n);
+ if (r) continue; // Get rid of "may be used uninitialized" warning
+ node::free(n);
+ }
+ m_nodes.destroy();
+}
+
+// Add an edge (a_id, b_id) to the graph
+void wfg::add_edge(TXNID a_txnid, TXNID b_txnid) {
+ node *a_node = find_create_node(a_txnid);
+ node *b_node = find_create_node(b_txnid);
+ a_node->edges.add(b_node->txnid);
+}
+
+// Return true if a node with the given transaction id exists in the graph.
+// Return false otherwise.
+bool wfg::node_exists(TXNID txnid) {
+ node *n = find_node(txnid);
+ return n != NULL;
+}
+
+bool wfg::cycle_exists_from_node(node *target, node *head,
+ std::function reporter) {
+ bool cycle_found = false;
+ head->visited = true;
+ uint32_t n_edges = head->edges.size();
+ for (uint32_t i = 0; i < n_edges && !cycle_found; i++) {
+ TXNID edge_id = head->edges.get(i);
+ if (target->txnid == edge_id) {
+ cycle_found = true;
+ if (reporter) reporter(edge_id);
+ } else {
+ node *new_head = find_node(edge_id);
+ if (new_head && !new_head->visited) {
+ cycle_found = cycle_exists_from_node(target, new_head, reporter);
+ if (cycle_found && reporter) reporter(edge_id);
+ }
+ }
+ }
+ head->visited = false;
+ return cycle_found;
+}
+
+// Return true if there exists a cycle from a given transaction id in the graph.
+// Return false otherwise.
+bool wfg::cycle_exists_from_txnid(TXNID txnid,
+ std::function reporter) {
+ node *a_node = find_node(txnid);
+ bool cycles_found = false;
+ if (a_node) {
+ cycles_found = cycle_exists_from_node(a_node, a_node, reporter);
+ }
+ return cycles_found;
+}
+
+// Apply a given function f to all of the nodes in the graph. The apply
+// function returns when the function f is called for all of the nodes in the
+// graph, or the function f returns non-zero.
+void wfg::apply_nodes(int (*fn)(TXNID id, void *extra), void *extra) {
+ int r = 0;
+ uint32_t n_nodes = m_nodes.size();
+ for (uint32_t i = 0; i < n_nodes && r == 0; i++) {
+ node *n;
+ r = m_nodes.fetch(i, &n);
+ invariant_zero(r);
+ if (r) continue; // Get rid of "may be used uninitialized" warning
+ r = fn(n->txnid, extra);
+ }
+}
+
+// Apply a given function f to all of the edges whose origin is a given node id.
+// The apply function returns when the function f is called for all edges in the
+// graph rooted at node id, or the function f returns non-zero.
+void wfg::apply_edges(TXNID txnid,
+ int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra),
+ void *extra) {
+ node *n = find_node(txnid);
+ if (n) {
+ int r = 0;
+ uint32_t n_edges = n->edges.size();
+ for (uint32_t i = 0; i < n_edges && r == 0; i++) {
+ r = fn(txnid, n->edges.get(i), extra);
+ }
+ }
+}
+
+// find node by id
+wfg::node *wfg::find_node(TXNID txnid) {
+ node *n = nullptr;
+ int r = m_nodes.find_zero(txnid, &n, nullptr);
+ invariant(r == 0 || r == DB_NOTFOUND);
+ return n;
+}
+
+// this is the omt comparison function
+// nodes are compared by their txnid.
+int wfg::find_by_txnid(node *const &node_a, const TXNID &txnid_b) {
+ TXNID txnid_a = node_a->txnid;
+ if (txnid_a < txnid_b) {
+ return -1;
+ } else if (txnid_a == txnid_b) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+// insert a new node
+wfg::node *wfg::find_create_node(TXNID txnid) {
+ node *n;
+ uint32_t idx;
+ int r = m_nodes.find_zero(txnid, &n, &idx);
+ if (r == DB_NOTFOUND) {
+ n = node::alloc(txnid);
+ r = m_nodes.insert_at(n, idx);
+ invariant_zero(r);
+ }
+ invariant_notnull(n);
+ return n;
+}
+
+wfg::node *wfg::node::alloc(TXNID txnid) {
+ node *XCALLOC(n);
+ n->txnid = txnid;
+ n->visited = false;
+ n->edges.create();
+ return n;
+}
+
+void wfg::node::free(wfg::node *n) {
+ n->edges.destroy();
+ toku_free(n);
+}
+
+} /* namespace toku */
+#endif // OS_WIN
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
new file mode 100644
index 000000000..804202170
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
@@ -0,0 +1,124 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with PerconaFT. If not, see .
+
+----------------------------------------
+
+ PerconaFT is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License, version 3,
+ as published by the Free Software Foundation.
+
+ PerconaFT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with PerconaFT. If not, see .
+
+----------------------------------------
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+======= */
+
+#ident \
+ "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include
+
+#include "../util/omt.h"
+#include "txnid_set.h"
+
+namespace toku {
+
+// A wfg is a 'wait-for' graph. A directed edge in represents one
+// txn waiting for another to finish before it can acquire a lock.
+
+class wfg {
+ public:
+ // Create a lock request graph
+ void create(void);
+
+ // Destroy the internals of the lock request graph
+ void destroy(void);
+
+ // Add an edge (a_id, b_id) to the graph
+ void add_edge(TXNID a_txnid, TXNID b_txnid);
+
+ // Return true if a node with the given transaction id exists in the graph.
+ // Return false otherwise.
+ bool node_exists(TXNID txnid);
+
+ // Return true if there exists a cycle from a given transaction id in the
+ // graph. Return false otherwise.
+ bool cycle_exists_from_txnid(TXNID txnid,
+ std::function reporter);
+
+ // Apply a given function f to all of the nodes in the graph. The apply
+ // function returns when the function f is called for all of the nodes in the
+ // graph, or the function f returns non-zero.
+ void apply_nodes(int (*fn)(TXNID txnid, void *extra), void *extra);
+
+ // Apply a given function f to all of the edges whose origin is a given node
+ // id. The apply function returns when the function f is called for all edges
+ // in the graph rooted at node id, or the function f returns non-zero.
+ void apply_edges(TXNID txnid,
+ int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra),
+ void *extra);
+
+ private:
+ struct node {
+ // txnid for this node and the associated set of edges
+ TXNID txnid;
+ txnid_set edges;
+ bool visited;
+
+ static node *alloc(TXNID txnid);
+
+ static void free(node *n);
+ };
+ ENSURE_POD(node);
+
+ toku::omt m_nodes;
+
+ node *find_node(TXNID txnid);
+
+ node *find_create_node(TXNID txnid);
+
+ bool cycle_exists_from_node(node *target, node *head,
+ std::function reporter);
+
+ static int find_by_txnid(node *const &node_a, const TXNID &txnid_b);
+};
+ENSURE_POD(wfg);
+
+} /* namespace toku */
-- 
cgit v1.2.3
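
Illustrative usage sketch for the txnid_set API added above (txnid_set.h / txnid_set.cc); it is not part of the patch itself. The sketch assumes it is compiled inside the same range_tree lib tree, so that txnid_set.h and the TXNID typedef from ../portability/txn_subst.h resolve, and it exercises only the public calls declared in the header.

    // txnid_set_sketch.cc - illustrative only, not part of the upstream import
    #include <cassert>

    #include "txnid_set.h"

    static void txnid_set_sketch(void) {
      toku::txnid_set ids;
      ids.create();               // storage is allocated lazily on the first add()
      ids.add(100);
      ids.add(42);
      ids.add(100);               // adding an existing id leaves the set unchanged
      assert(ids.contains(42));
      assert(ids.size() == 2);
      assert(ids.get(0) == 42);   // get(i) returns ids in sorted (ascending) order
      ids.remove(42);
      assert(!ids.contains(42));
      ids.destroy();
    }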
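
Illustrative usage sketch for the wfg (wait-for graph) API added above (wfg.h / wfg.cc); it is not part of the patch itself. add_edge(a, b) records that txn a waits for txn b, and cycle_exists_from_txnid() runs a depth-first search that reports each txn it finds on a cycle through the optional reporter callback. The reporter's template arguments were stripped in the extraction above, so the sketch assumes it is a std::function taking a TXNID; it also assumes the snippet builds inside the same lib tree.

    // wfg_sketch.cc - illustrative only, not part of the upstream import
    #include <cstdio>

    #include "wfg.h"

    static void wfg_sketch(void) {
      toku::wfg graph;
      graph.create();
      // txn 1 waits for txn 2, txn 2 waits for txn 3, txn 3 waits for txn 1
      graph.add_edge(1, 2);
      graph.add_edge(2, 3);
      graph.add_edge(3, 1);
      bool deadlocked = graph.cycle_exists_from_txnid(1, [](TXNID waiter) {
        std::printf("txn %llu is on the cycle\n", (unsigned long long)waiter);
      });
      (void)deadlocked;  // true here: the edges above form the cycle 1 -> 2 -> 3 -> 1
      graph.destroy();
    }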