Adding upstream version 1:10.5.12.upstream/1%10.5.12 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 18:07:14 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 18:07:14 +0000
commit: a175314c3e5827eb193872241446f2f8f5c9d33c (patch)
tree: cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/tokudb/PerconaFT/util
parent: Initial commit. (diff)
download: mariadb-10.5-upstream/1%10.5.12.tar.xz
mariadb-10.5-upstream/1%10.5.12.zip
64 files changed, 14712 insertions, 0 deletions
diff --git a/storage/tokudb/PerconaFT/util/CMakeLists.txt b/storage/tokudb/PerconaFT/util/CMakeLists.txt
new file mode 100644
index 00000000..6f6b899e
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/CMakeLists.txt
@@ -0,0 +1,34 @@
+set(util_srcs  # sources shared by the shared and the static util library (listed without file extension)
+  context
+  dbt
+  frwlock
+  kibbutz
+  memarena
+  mempool
+  minicron
+  partitioned_counter
+  queue
+  threadpool
+  scoped_malloc
+  x1764
+  )
+
+add_library(util SHARED ${util_srcs})
+add_library(util_static STATIC ${util_srcs})
+maybe_add_gcov_to_libraries(util util_static)  # helper defined outside this file
+set_target_properties(util_static PROPERTIES POSITION_INDEPENDENT_CODE ON)  # build the static archive as PIC
+target_link_libraries(util LINK_PRIVATE ${LIBTOKUPORTABILITY})
+target_link_libraries(util LINK_PUBLIC ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS})
+add_dependencies(util install_tdb_h)  # build-order dependency only; install_tdb_h is defined outside this file
+add_dependencies(util_static install_tdb_h)
+
+# Install the public header only when MYSQL_PROJECT_NAME_DOCSTRING is undefined, i.e. when not built as a subproject.
+if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING)
+  install(
+    FILES partitioned_counter.h
+    DESTINATION include
+    COMPONENT tokukv_headers
+    )
+endif ()
+
+add_subdirectory(tests)
diff --git a/storage/tokudb/PerconaFT/util/bytestring.h b/storage/tokudb/PerconaFT/util/bytestring.h
new file mode 100644
index 00000000..f946ad60
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/bytestring.h
@@ -0,0 +1,46 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "portability/toku_stdint.h"
+
+struct BYTESTRING {  // length-prefixed byte buffer; not NUL-terminated by construction
+    uint32_t len;    // presumably the number of bytes at `data` — TODO confirm against users
+    char *data;      // the bytes; ownership is not specified in this header
+};
diff --git a/storage/tokudb/PerconaFT/util/constexpr.h b/storage/tokudb/PerconaFT/util/constexpr.h
new file mode 100644
index 00000000..fce2cf3a
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/constexpr.h
@@ -0,0 +1,52 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+constexpr char UU() static_tolower(const char a) {  // ASCII-only lowercase usable in constant expressions
+    return (a < 'A' || a > 'Z') ? a : a + ('a' - 'A');
+}
+
+constexpr int UU() static_strncasecmp(const char *a, const char *b, size_t len) {
+    // Recurse only while budget remains, `a` has not ended, and the folded characters agree.
+    return (len == 0) ? 0
+         : (*a != '\0' && static_tolower(*a) == static_tolower(*b))
+         ? static_strncasecmp(a + 1, b + 1, len - 1)
+         : static_tolower(*a) - static_tolower(*b);
+}
+
diff --git a/storage/tokudb/PerconaFT/util/context.cc b/storage/tokudb/PerconaFT/util/context.cc
new file mode 100644
index 00000000..dafe4e84
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/context.cc
@@ -0,0 +1,184 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <string.h>
+
+#include <util/context.h>
+
+namespace toku {
+
+    static const context default_context(CTX_DEFAULT);  // fallback context every thread starts in
+    static __thread const context *tl_current_context = &default_context;  // per-thread "current context" pointer
+
+    // Push: remember the thread's previous context, then make this object the current one.
+    context::context(const context_id id) :
+        m_old_ctx(tl_current_context),
+        m_id(id) {
+        tl_current_context = this;
+    }
+
+    // Pop: make the context that was current at construction time current again.
+    context::~context() {
+        tl_current_context = m_old_ctx;
+    }
+
+} // namespace toku
+
+// thread local context
+
+const toku::context *toku_thread_get_context() {  // returns the calling thread's current context pointer
+    return toku::tl_current_context;
+}
+
+// engine status
+
+static struct context_status context_status;  // file-local table of contention counters
+#define CONTEXT_STATUS_INIT(key, legend) TOKUFT_STATUS_INIT(context_status, key, nullptr, PARCOUNT, "context: " legend, TOKU_ENGINE_STATUS)
+
+void toku_context_status_init(void) {  // registers one PARCOUNT row per context_status_entry, then marks the table ready
+    CONTEXT_STATUS_INIT(CTX_SEARCH_BLOCKED_BY_FULL_FETCH,           "tree traversals blocked by a full fetch");
+    CONTEXT_STATUS_INIT(CTX_SEARCH_BLOCKED_BY_PARTIAL_FETCH,        "tree traversals blocked by a partial fetch");
+    CONTEXT_STATUS_INIT(CTX_SEARCH_BLOCKED_BY_FULL_EVICTION,        "tree traversals blocked by a full eviction");
+    CONTEXT_STATUS_INIT(CTX_SEARCH_BLOCKED_BY_PARTIAL_EVICTION,     "tree traversals blocked by a partial eviction");
+    CONTEXT_STATUS_INIT(CTX_SEARCH_BLOCKED_BY_MESSAGE_INJECTION,    "tree traversals blocked by a message injection");
+    CONTEXT_STATUS_INIT(CTX_SEARCH_BLOCKED_BY_MESSAGE_APPLICATION,  "tree traversals blocked by a message application");
+    CONTEXT_STATUS_INIT(CTX_SEARCH_BLOCKED_BY_FLUSH,                "tree traversals blocked by a flush");
+    CONTEXT_STATUS_INIT(CTX_SEARCH_BLOCKED_BY_CLEANER,              "tree traversals blocked by a the cleaner thread");
+    CONTEXT_STATUS_INIT(CTX_SEARCH_BLOCKED_OTHER,                   "tree traversals blocked by something uninstrumented");
+    CONTEXT_STATUS_INIT(CTX_PROMO_BLOCKED_BY_FULL_FETCH,            "promotion blocked by a full fetch (should never happen)");
+    CONTEXT_STATUS_INIT(CTX_PROMO_BLOCKED_BY_PARTIAL_FETCH,         "promotion blocked by a partial fetch (should never happen)");
+    CONTEXT_STATUS_INIT(CTX_PROMO_BLOCKED_BY_FULL_EVICTION,         "promotion blocked by a full eviction (should never happen)");
+    CONTEXT_STATUS_INIT(CTX_PROMO_BLOCKED_BY_PARTIAL_EVICTION,      "promotion blocked by a partial eviction (should never happen)");
+    CONTEXT_STATUS_INIT(CTX_PROMO_BLOCKED_BY_MESSAGE_INJECTION,     "promotion blocked by a message injection");
+    CONTEXT_STATUS_INIT(CTX_PROMO_BLOCKED_BY_MESSAGE_APPLICATION,   "promotion blocked by a message application");
+    CONTEXT_STATUS_INIT(CTX_PROMO_BLOCKED_BY_FLUSH,                 "promotion blocked by a flush");
+    CONTEXT_STATUS_INIT(CTX_PROMO_BLOCKED_BY_CLEANER,               "promotion blocked by the cleaner thread");
+    CONTEXT_STATUS_INIT(CTX_PROMO_BLOCKED_OTHER,                    "promotion blocked by something uninstrumented");
+    CONTEXT_STATUS_INIT(CTX_BLOCKED_OTHER,                          "something uninstrumented blocked by something uninstrumented");
+    context_status.initialized = true;  // asserted by the status getter and the contention recorder
+}
+#undef CONTEXT_STATUS_INIT
+
+void toku_context_get_status(struct context_status *status) {  // copies the whole status table into *status
+    assert(context_status.initialized);  // toku_context_status_init() must have run first
+    *status = context_status;
+}
+
+#define STATUS_INC(x, d) increment_partitioned_counter(context_status.status[x].value.parcount, d);
+
+void toku_context_note_frwlock_contention(const context_id blocked, const context_id blocking) {  // blocked: the context that waited; blocking: the one it waited on
+    assert(context_status.initialized);
+    if (blocked != CTX_SEARCH && blocked != CTX_PROMO) {
+        // Only searches and promotions are broken down by cause; every other waiter shares one counter.
+        STATUS_INC(CTX_BLOCKED_OTHER, 1);
+        return;
+    }
+    switch (blocking) {  // blocked is CTX_SEARCH or CTX_PROMO from here on, so each else-if below always matches
+    case CTX_FULL_FETCH:
+        if (blocked == CTX_SEARCH) {
+            STATUS_INC(CTX_SEARCH_BLOCKED_BY_FULL_FETCH, 1);
+        } else if (blocked == CTX_PROMO) {
+            STATUS_INC(CTX_PROMO_BLOCKED_BY_FULL_FETCH, 1);
+        }
+        break;
+    case CTX_PARTIAL_FETCH:
+        if (blocked == CTX_SEARCH) {
+            STATUS_INC(CTX_SEARCH_BLOCKED_BY_PARTIAL_FETCH, 1);
+        } else if (blocked == CTX_PROMO) {
+            STATUS_INC(CTX_PROMO_BLOCKED_BY_PARTIAL_FETCH, 1);
+        }
+        break;
+    case CTX_FULL_EVICTION:
+        if (blocked == CTX_SEARCH) {
+            STATUS_INC(CTX_SEARCH_BLOCKED_BY_FULL_EVICTION, 1);
+        } else if (blocked == CTX_PROMO) {
+            STATUS_INC(CTX_PROMO_BLOCKED_BY_FULL_EVICTION, 1);
+        }
+        break;
+    case CTX_PARTIAL_EVICTION:
+        if (blocked == CTX_SEARCH) {
+            STATUS_INC(CTX_SEARCH_BLOCKED_BY_PARTIAL_EVICTION, 1);
+        } else if (blocked == CTX_PROMO) {
+            STATUS_INC(CTX_PROMO_BLOCKED_BY_PARTIAL_EVICTION, 1);
+        }
+        break;
+    case CTX_MESSAGE_INJECTION:
+        if (blocked == CTX_SEARCH) {
+            STATUS_INC(CTX_SEARCH_BLOCKED_BY_MESSAGE_INJECTION, 1);
+        } else if (blocked == CTX_PROMO) {
+            STATUS_INC(CTX_PROMO_BLOCKED_BY_MESSAGE_INJECTION, 1);
+        }
+        break;
+    case CTX_MESSAGE_APPLICATION:
+        if (blocked == CTX_SEARCH) {
+            STATUS_INC(CTX_SEARCH_BLOCKED_BY_MESSAGE_APPLICATION, 1);
+        } else if (blocked == CTX_PROMO) {
+            STATUS_INC(CTX_PROMO_BLOCKED_BY_MESSAGE_APPLICATION, 1);
+        }
+        break;
+    case CTX_FLUSH:
+        if (blocked == CTX_SEARCH) {
+            STATUS_INC(CTX_SEARCH_BLOCKED_BY_FLUSH, 1);
+        } else if (blocked == CTX_PROMO) {
+            STATUS_INC(CTX_PROMO_BLOCKED_BY_FLUSH, 1);
+        }
+        break;
+    case CTX_CLEANER:
+        if (blocked == CTX_SEARCH) {
+            STATUS_INC(CTX_SEARCH_BLOCKED_BY_CLEANER, 1);
+        } else if (blocked == CTX_PROMO) {
+            STATUS_INC(CTX_PROMO_BLOCKED_BY_CLEANER, 1);
+        }
+        break;
+    default:  // any blocking context without a dedicated row, e.g. CTX_DEFAULT
+        if (blocked == CTX_SEARCH) {
+            STATUS_INC(CTX_SEARCH_BLOCKED_OTHER, 1);
+        } else if (blocked == CTX_PROMO) {
+            STATUS_INC(CTX_PROMO_BLOCKED_OTHER, 1);
+        }
+        break;
+    }
+}
+
+void toku_context_status_destroy(void) {
+    // Tear down the partitioned counter behind every row whose type is PARCOUNT.
+    for (auto &row : context_status.status) {
+        if (row.type != PARCOUNT) { continue; }
+        destroy_partitioned_counter(row.value.parcount);
+    }
+}
diff --git a/storage/tokudb/PerconaFT/util/context.h b/storage/tokudb/PerconaFT/util/context.h
new file mode 100644
index 00000000..de4d2076
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/context.h
@@ -0,0 +1,152 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <portability/toku_portability.h>
+
+#include <db.h>
+
+#include <util/status.h>
+
+enum context_id {
+    CTX_INVALID = -1,
+    CTX_DEFAULT = 0,          // default context for when no context is set
+    CTX_SEARCH,               // searching for a key at the bottom of the tree
+    CTX_PROMO,                // promoting a message down the tree
+    CTX_FULL_FETCH,           // performing full fetch (pivots + some partial fetch)
+    CTX_PARTIAL_FETCH,        // performing partial fetch
+    CTX_FULL_EVICTION,        // running full eviction
+    CTX_PARTIAL_EVICTION,     // running partial eviction
+    CTX_MESSAGE_INJECTION,    // injecting a message into a buffer
+    CTX_MESSAGE_APPLICATION,  // applying ancestor's messages to a basement node
+    CTX_FLUSH,                // flushing a buffer
+    CTX_CLEANER               // doing work as the cleaner thread
+};
+
+// Note a contention event in engine status: `blocked` is the context that had to wait, `blocking` the one it waited on.
+void toku_context_note_frwlock_contention(const context_id blocked, const context_id blocking);
+
+namespace toku {
+
+    // class for tracking what a thread is doing
+    //
+    // usage (INTERESTING_FN_1 / EXPENSIVE_FN_1 stand in for real context_id values):
+    //
+    // // automatically tag and document what you're doing
+    // void my_interesting_function(void) {
+    //     toku::context ctx(INTERESTING_FN_1);
+    //     ...
+    //     {
+    //         toku::context inner_ctx(EXPENSIVE_FN_1);
+    //         my_rwlock.wrlock();
+    //         expensive();
+    //         my_rwlock.wrunlock();
+    //     }
+    //     ...
+    // }
+    //
+    // // ... so later you can write code like this.
+    // // here, we save some info to help determine why a lock could not be acquired
+    // void my_rwlock::wrlock() {
+    //     r = try_acquire_write_lock();
+    //     if (r == 0) {
+    //         m_write_locked_context_id = toku_thread_get_context()->get_id();
+    //         ...
+    //     } else {
+    //         if (m_write_locked_context_id == EXPENSIVE_FN_1) {
+    //             status.blocked_because_of_expensive_fn_1++;
+    //         } else if (...) {
+    //            ...
+    //         }
+    //         ...
+    //     }
+    // }
+    class context {
+    public:
+        context(const context_id id);  // makes this object the calling thread's current context
+
+        ~context();  // reinstates the context that was current when this object was constructed
+
+        context_id get_id() const {
+            return m_id;
+        }
+
+    private:
+        // each thread has a stack of contexts, rooted at the trivial "root context"
+        const context *m_old_ctx;  // the context this one replaced
+        const context_id m_id;
+    };
+
+} // namespace toku
+
+// Get the current context of this thread
+const toku::context *toku_thread_get_context();
+
+enum context_status_entry {  // row indices into context_status::status
+    CTX_SEARCH_BLOCKED_BY_FULL_FETCH = 0,
+    CTX_SEARCH_BLOCKED_BY_PARTIAL_FETCH,
+    CTX_SEARCH_BLOCKED_BY_FULL_EVICTION,
+    CTX_SEARCH_BLOCKED_BY_PARTIAL_EVICTION,
+    CTX_SEARCH_BLOCKED_BY_MESSAGE_INJECTION,
+    CTX_SEARCH_BLOCKED_BY_MESSAGE_APPLICATION,
+    CTX_SEARCH_BLOCKED_BY_FLUSH,
+    CTX_SEARCH_BLOCKED_BY_CLEANER,
+    CTX_SEARCH_BLOCKED_OTHER,
+    CTX_PROMO_BLOCKED_BY_FULL_FETCH,
+    CTX_PROMO_BLOCKED_BY_PARTIAL_FETCH,
+    CTX_PROMO_BLOCKED_BY_FULL_EVICTION,
+    CTX_PROMO_BLOCKED_BY_PARTIAL_EVICTION,
+    CTX_PROMO_BLOCKED_BY_MESSAGE_INJECTION,
+    CTX_PROMO_BLOCKED_BY_MESSAGE_APPLICATION,
+    CTX_PROMO_BLOCKED_BY_FLUSH,
+    CTX_PROMO_BLOCKED_BY_CLEANER,
+    CTX_PROMO_BLOCKED_OTHER,
+    CTX_BLOCKED_OTHER,
+    CTX_STATUS_NUM_ROWS  // not a row: the row count, used to size the status array
+};
+
+struct context_status {
+    bool initialized;  // set to true at the end of toku_context_status_init()
+    TOKU_ENGINE_STATUS_ROW_S status[CTX_STATUS_NUM_ROWS];  // one row per context_status_entry value
+};
+
+void toku_context_get_status(struct context_status *status);  // copies the current table into *status; requires a prior init
+
+void toku_context_status_init(void);  // sets up every row and marks the table initialized
+void toku_context_status_destroy(void);  // destroys the partitioned counters behind the PARCOUNT rows
diff --git a/storage/tokudb/PerconaFT/util/dbt.cc b/storage/tokudb/PerconaFT/util/dbt.cc
new file mode 100644
index 00000000..b6d2a584
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/dbt.cc
@@ -0,0 +1,291 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <db.h>
+#include <string.h>
+
+#include "portability/memory.h"
+
+#include "util/dbt.h"
+
+DBT *toku_init_dbt(DBT *dbt) {
+    // Zero the whole structure and hand the same pointer back for chaining.
+    return static_cast<DBT *>(memset(dbt, 0, sizeof(DBT)));
+}
+
+DBT toku_empty_dbt(void) {
+    static const DBT all_zero = {};  // data, size, ulen and flags are all zero
+    return all_zero;
+}
+
+DBT *toku_init_dbt_flags(DBT *dbt, uint32_t flags) {  // zero the DBT, then stamp the given flags on it
+    DBT *const cleared = toku_init_dbt(dbt);
+    cleared->flags = flags;
+    return cleared;
+}
+
+DBT_ARRAY *toku_dbt_array_init(DBT_ARRAY *dbts, uint32_t size) {
+    // Capacity is the smallest power of two >= size (minimum 1); every slot starts as an empty DB_DBT_REALLOC dbt.
+    uint32_t cap = 1;
+    for (; cap < size; cap <<= 1) {}
+    XMALLOC_N(cap, dbts->dbts);
+    for (DBT *slot = dbts->dbts; slot != dbts->dbts + cap; slot++) {
+        toku_init_dbt_flags(slot, DB_DBT_REALLOC);
+    }
+    dbts->capacity = cap;
+    dbts->size = size;
+    return dbts;
+}
+
+void toku_dbt_array_resize(DBT_ARRAY *dbts, uint32_t size) {
+    if (size == dbts->size) {
+        return;  // already the requested size
+    }
+    if (size > dbts->capacity) {
+        // Grow: double the capacity until it fits, then initialize only the newly added tail slots.
+        const uint32_t first_new = dbts->capacity;
+        uint32_t grown = first_new;
+        while (grown < size) {
+            grown *= 2;
+        }
+        dbts->capacity = grown;
+        XREALLOC_N(grown, dbts->dbts);
+        for (uint32_t i = first_new; i < grown; i++) {
+            toku_init_dbt_flags(&dbts->dbts[i], DB_DBT_REALLOC);
+        }
+    } else if (size < dbts->size && dbts->capacity >= 8 && size < dbts->capacity / 4) {
+        // Shrink: halve the capacity, destroying the slots being cut off before the realloc.
+        const int keep = dbts->capacity / 2;
+        for (int i = keep; i < (int) dbts->capacity; i++) {
+            toku_destroy_dbt(&dbts->dbts[i]);
+        }
+        XREALLOC_N(keep, dbts->dbts);
+        dbts->capacity = keep;
+    }
+    dbts->size = size;
+}
+
+void toku_dbt_array_destroy_shallow(DBT_ARRAY *dbts) {  // frees only the slot array, not what each slot points at
+    toku_free(dbts->dbts);
+    ZERO_STRUCT(*dbts);
+}
+
+void toku_dbt_array_destroy(DBT_ARRAY *dbts) {
+    for (DBT *slot = dbts->dbts, *const end = slot + dbts->capacity; slot != end; ++slot) {
+        toku_destroy_dbt(slot);  // walks the whole capacity, not just the first `size` slots
+    }
+    toku_dbt_array_destroy_shallow(dbts);
+}
+
+
+
+void toku_destroy_dbt(DBT *dbt) {
+    // Frees data only when flags is exactly DB_DBT_MALLOC or DB_DBT_REALLOC; otherwise a no-op.
+    const bool frees_data = dbt->flags == DB_DBT_MALLOC || dbt->flags == DB_DBT_REALLOC;
+    if (!frees_data) {
+        return;
+    }
+    toku_free(dbt->data);
+    toku_init_dbt(dbt);
+}
+
+DBT *toku_fill_dbt(DBT *dbt, const void *k, uint32_t len) {  // points dbt at k without copying; flags stay 0
+    DBT *const out = toku_init_dbt(dbt);
+    out->data = (char*)k;
+    out->size = len;
+    return out;
+}
+
+DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len) {  // dbt receives a DB_DBT_MALLOC copy of k[0..len)
+    DBT *const out = toku_init_dbt_flags(dbt, DB_DBT_MALLOC);
+    out->data = toku_xmemdup(k, len);
+    out->size = len;
+    return out;
+}
+
+DBT *toku_copyref_dbt(DBT *dst, const DBT src) {  // shallow alias: shares src's data pointer, clears flags and ulen
+    dst->data = src.data;
+    dst->size = src.size;
+    dst->ulen = 0;
+    dst->flags = 0;
+    return dst;
+}
+
+DBT *toku_clone_dbt(DBT *dst, const DBT &src) {  // deep copy: dst gets its own DB_DBT_MALLOC copy of src's bytes
+    return toku_memdup_dbt(dst, src.data, src.size);
+}
+
+void
+toku_sdbt_cleanup(struct simple_dbt *sdbt) {
+    if (sdbt->data != nullptr) { toku_free(sdbt->data); }  // release the buffer, if there is one
+    memset(sdbt, 0, sizeof(struct simple_dbt));
+}
+
+static inline int sdbt_realloc(struct simple_dbt *sdbt) {
+    // Resize sdbt->data to sdbt->len bytes (the caller sets len beforehand).
+    // On success the new pointer is stored and 0 is returned; on failure
+    // sdbt->data is left as it was and get_error_errno() is returned.
+    void *const resized = toku_realloc(sdbt->data, sdbt->len);
+    if (resized != NULL) {
+        sdbt->data = resized;
+        return 0;
+    }
+    return get_error_errno();
+}
+
+static inline int dbt_realloc(DBT *dbt) {
+    // Resize dbt->data to dbt->ulen bytes (the caller sets ulen beforehand).
+    // On success the new pointer is stored and 0 is returned; on failure
+    // dbt->data is left as it was and get_error_errno() is returned.
+    void *const resized = toku_realloc(dbt->data, dbt->ulen);
+    if (resized != NULL) {
+        dbt->data = resized;
+        return 0;
+    }
+    return get_error_errno();
+}
+
+// Copies len bytes of val into d as directed by d->flags: DB_DBT_USERMEM writes into the caller's
+// buffer, DB_DBT_MALLOC/DB_DBT_REALLOC (re)allocate d->data, and flags==0 stages the bytes in sdbt.
+int toku_dbt_set(uint32_t len, const void *val, DBT *d, struct simple_dbt *sdbt) {
+    int r;
+    uint32_t prior_capacity;  // buffer size recorded before a resize, restored if the resize fails
+    if (d == nullptr) return 0;  // no destination: nothing to do
+    switch (d->flags) {
+    case (DB_DBT_USERMEM):
+        d->size = len;  // set even when the buffer turns out to be too small
+        if (d->ulen<len) r = DB_BUFFER_SMALL;
+        else {
+            memcpy(d->data, val, len);
+            r = 0;
+        }
+        break;
+    case (DB_DBT_MALLOC):
+        d->data = NULL;
+        d->ulen = 0;
+        // fallthrough
+        // to DB_DBT_REALLOC
+    case (DB_DBT_REALLOC):
+        prior_capacity = d->ulen;
+        if (d->ulen < len) {
+            d->ulen = len*2;
+            r = dbt_realloc(d);
+        }
+        else if (d->ulen > 16 && d->ulen > len*4) {
+            d->ulen = len*2 < 16 ? 16 : len*2;
+            r = dbt_realloc(d);
+        }
+        else if (d->data==NULL) {
+            d->ulen = len;
+            r = dbt_realloc(d);
+        }
+        else r=0;
+        if (r==0) {
+            memcpy(d->data, val, len);
+            d->size = len;
+        }
+        else d->ulen = prior_capacity;  // dbt_realloc left d->data untouched; keep ulen in step with it
+        break;
+    case (0):
+        prior_capacity = sdbt->len;
+        if (sdbt->len < len) {
+            sdbt->len = len*2;
+            r = sdbt_realloc(sdbt);
+        }
+        else if (sdbt->len > 16 && sdbt->len > len*4) {
+            sdbt->len = len*2 < 16 ? 16 : len*2;
+            r = sdbt_realloc(sdbt);
+        }
+        else r=0;
+        if (r==0) {
+            memcpy(sdbt->data, val, len);
+            d->data = sdbt->data;
+            d->size = len;
+        }
+        else sdbt->len = prior_capacity;  // sdbt_realloc left sdbt->data untouched; keep len in step with it
+        break;
+    default:
+        r = EINVAL;
+        break;
+    }
+    return r;
+}
+
+const DBT *toku_dbt_positive_infinity(void) {
+    static DBT positive_infinity_dbt = {};  // sentinel recognised by its address, not by its all-zero contents
+    return &positive_infinity_dbt;
+}
+
+const DBT *toku_dbt_negative_infinity(void) {
+    static DBT negative_infinity_dbt = {};  // sentinel recognised by its address, not by its all-zero contents
+    return &negative_infinity_dbt;
+}
+
+bool toku_dbt_is_infinite(const DBT *dbt) {  // pointer-identity test against the two infinity sentinels
+    return dbt == toku_dbt_positive_infinity() || dbt == toku_dbt_negative_infinity();
+}
+
+bool toku_dbt_is_empty(const DBT *dbt) {
+    // can't have a null data field with a non-zero size
+    paranoid_invariant(dbt->data != nullptr || dbt->size == 0);
+    return dbt->data == nullptr;  // emptiness is decided by the data pointer alone
+}
+
+int toku_dbt_infinite_compare(const DBT *a, const DBT *b) {
+    // Ordering: -infinity sorts before everything and +infinity after everything;
+    // the very same pointer compares equal.
+    const DBT *const pos_inf = toku_dbt_positive_infinity();
+    const DBT *const neg_inf = toku_dbt_negative_infinity();
+    if (a == b) {
+        return 0;
+    }
+    if (a == pos_inf) { return 1; }
+    if (b == pos_inf || a == neg_inf) { return -1; }
+    // Neither case above applied, so the infinite operand has to be b == -infinity.
+    invariant(b == neg_inf);
+    return 1;
+}
+
+bool toku_dbt_equals(const DBT *a, const DBT *b) {
+    if (toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b)) {
+        // an infinity is equal only to itself, i.e. to the very same pointer
+        return a == b;
+    }
+    // ordinary dbts: shallow comparison of data pointer and size, not of the bytes
+    return a->size == b->size && a->data == b->data;
+}
diff --git a/storage/tokudb/PerconaFT/util/dbt.h b/storage/tokudb/PerconaFT/util/dbt.h
new file mode 100644
index 00000000..1b837567
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/dbt.h
@@ -0,0 +1,101 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <db.h>
+
+// TODO: John
+// Document this API a little better so that DBT
+// memory management can be more widely understood.
+
+DBT *toku_init_dbt(DBT *);  // zeroes the whole DBT and returns it
+
+// returns: an initialized but empty dbt (for which toku_dbt_is_empty() is true)
+DBT toku_empty_dbt(void);
+
+DBT *toku_init_dbt_flags(DBT *, uint32_t flags);  // zeroes the DBT, then sets its flags
+
+void toku_destroy_dbt(DBT *);  // frees data and re-zeroes only when flags is DB_DBT_MALLOC or DB_DBT_REALLOC
+
+DBT *toku_fill_dbt(DBT *dbt, const void *k, uint32_t len);  // points dbt at k without copying
+
+DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len);  // gives dbt a DB_DBT_MALLOC copy of k
+
+DBT *toku_copyref_dbt(DBT *dst, const DBT src);  // shallow: dst shares src's data pointer and size
+
+DBT *toku_clone_dbt(DBT *dst, const DBT &src);  // deep: dst gets its own copy of src's bytes
+
+int toku_dbt_set(uint32_t len, const void *val, DBT *d, struct simple_dbt *sdbt);  // copies val into d per d->flags; 0 on success
+
+int toku_dbt_set_value(DBT *, const void **val, uint32_t vallen, void **staticptrp, bool dbt1_disposable);  // NOTE(review): no definition in util/dbt.cc — confirm where it lives
+
+void toku_sdbt_cleanup(struct simple_dbt *sdbt);  // frees sdbt->data if set, then zeroes *sdbt
+
+// returns: special DBT pointer representing positive infinity
+const DBT *toku_dbt_positive_infinity(void);
+
+// returns: special DBT pointer representing negative infinity
+const DBT *toku_dbt_negative_infinity(void);
+
+// returns: true if the given dbt is either positive or negative infinity
+bool toku_dbt_is_infinite(const DBT *dbt);
+
+// returns: true if the given dbt has no data (ie: dbt->data == nullptr)
+bool toku_dbt_is_empty(const DBT *dbt);
+
+// effect: compares two potentially infinity-valued dbts
+// requires: at least one is infinite (assert otherwise)
+int toku_dbt_infinite_compare(const DBT *a, const DBT *b);
+
+// returns: true if the given dbts have the same data pointer and size
+bool toku_dbt_equals(const DBT *a, const DBT *b);
diff --git a/storage/tokudb/PerconaFT/util/dmt.cc b/storage/tokudb/PerconaFT/util/dmt.cc
new file mode 100644
index 00000000..a584bf2b
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/dmt.cc
@@ -0,0 +1,1213 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <string.h>
+#include <db.h>
+
+#include <portability/memory.h>
+#include <limits.h>
+
+namespace toku {
+
+// Puts the dmt into its initial, empty state: array form, zero values,
+// and "all values have the same size" with a recorded length of 0.
+// The mempool is only zeroed (toku_mempool_zero); nothing is constructed here.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::create(void) {
+    toku_mempool_zero(&this->mp);
+
+    // Empty array representation.
+    this->is_array = true;
+    this->d.a.num_values = 0;
+
+    // With no values stored, the fixed-size property holds trivially.
+    this->values_same_size = true;
+    this->value_length = 0;
+    //TODO: maybe allocate enough space for something by default?
+    //      We may be relying on not needing to allocate space the first time (due to limited time spent while a lock is held)
+}
+
+/**
+ * Builds an array-form dmt from 'numvalues' elements of 'fixed_value_length'
+ * bytes each, stored back to back (PACKED) in the 'mem_length' bytes at 'mem'.
+ * Each element is copied into a freshly constructed mempool at a stride of
+ * align(fixed_value_length), i.e. with alignment padding inserted if needed.
+ *
+ * Note: this function does not take ownership of 'mem'; the owner is still
+ * responsible for freeing it.
+ * While in the OMT a similar function would steal ownership, this doesn't make sense for the DMT because
+ * we (usually) have to add padding for alignment (mem has all of the elements PACKED).
+ * Also all current uses (as of Jan 12, 2014) of this function would require mallocing a new array
+ * in order to allow stealing.
+ */
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::create_from_sorted_memory_of_fixed_size_elements(
+        const void *mem,
+        const uint32_t numvalues,
+        const uint32_t mem_length,
+        const uint32_t fixed_value_length) {
+    // Record the array-form metadata first.
+    // NOTE(review): get_fixed_length_alignment_overhead() is queried only after
+    // value_length is set; presumably it depends on it -- keep this order.
+    this->is_array = true;
+    this->values_same_size = true;
+    this->value_length = fixed_value_length;
+    this->d.a.num_values = numvalues;
+
+    const uint8_t pad_bytes = get_fixed_length_alignment_overhead();
+    const uint32_t aligned_memsize = mem_length + numvalues * pad_bytes;
+    toku_mempool_construct(&this->mp, aligned_memsize);
+    if (aligned_memsize == 0) {
+        // Nothing to copy.
+        return;
+    }
+
+    paranoid_invariant(numvalues > 0);
+    void *const pool_mem = toku_mempool_malloc(&this->mp, aligned_memsize);
+    paranoid_invariant_notnull(pool_mem);
+    uint8_t * const dest = static_cast<uint8_t *>(pool_mem);
+    const uint8_t * const src = static_cast<const uint8_t *>(mem);
+
+    if (pad_bytes == 0) {
+        // Packed and aligned layouts coincide: one bulk copy is enough.
+        paranoid_invariant(aligned_memsize == mem_length);
+        memcpy(dest, src, aligned_memsize);
+        return;
+    }
+
+    // Padding is required: copy element by element, packed stride -> aligned stride.
+    // TODO(leif): check what vectorizes best: multiplying like this or adding to offsets
+    const uint32_t packed_len = this->value_length;
+    const uint32_t padded_len = align(this->value_length);
+    paranoid_invariant(this->d.a.num_values*packed_len == mem_length);
+    for (uint32_t i = 0; i < this->d.a.num_values; i++) {
+        memcpy(&dest[i*padded_len], &src[i*packed_len], packed_len);
+    }
+}
+
+// Makes this dmt a copy of 'src'.
+// The assignment copies every member (including the mempool descriptor);
+// toku_mempool_clone(&src.mp, &this->mp) is then applied to this->mp,
+// presumably giving this dmt its own copy of the storage -- confirm against
+// the mempool API. The two statements must stay in this order.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::clone(const dmt &src) {
+    (*this) = src;
+    toku_mempool_clone(&src.mp, &this->mp);
+}
+
+// Empties the dmt: back to array form with zero values and the fixed-size
+// state reset, then resets (does not destroy) the mempool.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::clear(void) {
+    // Reset the fixed-size bookkeeping.
+    this->values_same_size = true;
+    this->value_length = 0;
+
+    // Fall back to the (empty) array representation.
+    this->is_array = true;
+    this->d.a.num_values = 0;
+
+    //TODO(leif): Note that this can mess with our memory_footprint calculation (we may touch past what is marked as 'used' in the mempool)
+    //            One 'fix' is for mempool to also track what was touched, and reset() shouldn't reset that, though realloc() might.
+    toku_mempool_reset(&this->mp);
+}
+
+// Tears the dmt down: clears all bookkeeping via clear(), then destroys the
+// mempool with toku_mempool_destroy().
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::destroy(void) {
+    clear();
+    toku_mempool_destroy(&this->mp);
+}
+
+// Returns the number of values stored: the array's element count in array
+// form, otherwise the weight of the tree root.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+uint32_t dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::size(void) const {
+    return this->is_array ? this->d.a.num_values
+                          : this->nweight(this->d.t.root);
+}
+
+// Returns the weight recorded in the node that 'subtree' refers to,
+// or 0 when 'subtree' is null.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+uint32_t dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::nweight(const subtree &subtree) const {
+    if (subtree.is_null()) {
+        return 0;
+    }
+    return get_node(subtree).weight;
+}
+
+// Inserts 'value' at the position determined by find_zero<dmtcmp_t, h>(v).
+// Returns:
+//   DB_KEYEXIST  if find_zero() returns 0 (*idx, if given, gets that index);
+//   any find_zero() result other than DB_NOTFOUND, passed through unchanged;
+//   any non-zero result of insert_at(), passed through unchanged;
+//   0            on success (*idx, if given, gets the index inserted at).
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename dmtcmp_t, int (*h)(const uint32_t size, const dmtdata_t &, const dmtcmp_t &)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::insert(const dmtwriter_t &value, const dmtcmp_t &v, uint32_t *const idx) {
+    uint32_t position;
+    const int find_rc = this->find_zero<dmtcmp_t, h>(v, nullptr, nullptr, &position);
+    if (find_rc == 0) {
+        if (idx != nullptr) {
+            *idx = position;
+        }
+        return DB_KEYEXIST;
+    }
+    if (find_rc != DB_NOTFOUND) {
+        return find_rc;
+    }
+
+    const int insert_rc = this->insert_at(value, position);
+    if (insert_rc != 0) {
+        return insert_rc;
+    }
+    if (idx != nullptr) {
+        *idx = position;
+    }
+    return 0;
+}
+
+// Inserts 'value' so that it ends up at index 'idx'.
+// Returns EINVAL if idx > size(), otherwise 0 (or the result of
+// insert_at_array_end() on the array fast path).
+// The array form is kept only when the value preserves the fixed-size
+// property and is appended at the end; every other case goes through the tree.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::insert_at(const dmtwriter_t &value, const uint32_t idx) {
+    if (idx > this->size()) {
+        return EINVAL;
+    }
+
+    // True when, after this insert, all values would still share one size.
+    const bool keeps_fixed_size =
+        this->values_same_size && (this->size() == 0 || value.get_size() == this->value_length);
+
+    if (this->is_array) {
+        const bool appending = (idx == this->d.a.num_values);
+        if (keeps_fixed_size && appending) {
+            return this->insert_at_array_end<true>(value);
+        }
+        this->convert_from_array_to_tree();
+    }
+
+    // From here on the dmt is in tree form.
+    paranoid_invariant(!is_array);
+    if (!keeps_fixed_size) {
+        this->values_same_size = false;
+        this->value_length = 0;
+    }
+
+    this->maybe_resize_tree(&value);
+    subtree *to_rebalance = nullptr;
+    this->insert_internal(&this->d.t.root, value, idx, &to_rebalance);
+    if (to_rebalance != nullptr) {
+        this->rebalance(to_rebalance);
+    }
+    return 0;
+}
+
+// Appends 'value_in' to an array-form, fixed-size dmt. Always returns 0.
+// The first value appended fixes value_length. When 'with_resize' is true,
+// maybe_resize_array_for_insert() is called before the slot is allocated.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<bool with_resize>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::insert_at_array_end(const dmtwriter_t& value_in) {
+    paranoid_invariant(this->is_array);
+    paranoid_invariant(this->values_same_size);
+
+    const bool is_first_value = (this->d.a.num_values == 0);
+    if (is_first_value) {
+        this->value_length = value_in.get_size();
+    }
+    paranoid_invariant(this->value_length == value_in.get_size());
+
+    if (with_resize) {
+        this->maybe_resize_array_for_insert();
+    }
+    value_in.write_to(this->alloc_array_value_end());
+    return 0;
+}
+
+// Bumps the array's element count and allocates align(value_length) bytes
+// from the mempool for the new last element. Returns a pointer to that slot.
+// The paranoid checks expect the slot to be ALIGNMENT-aligned and to be the
+// address get_array_value() computes for the new last index.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+dmtdata_t * dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::alloc_array_value_end(void) {
+    paranoid_invariant(this->is_array);
+    paranoid_invariant(this->values_same_size);
+    ++this->d.a.num_values;
+
+    void *const raw = toku_mempool_malloc(&this->mp, align(this->value_length));
+    paranoid_invariant_notnull(raw);
+    paranoid_invariant(reinterpret_cast<size_t>(raw) % ALIGNMENT == 0);
+    dmtdata_t *const slot = static_cast<dmtdata_t *>(raw);
+    paranoid_invariant(slot == get_array_value(this->d.a.num_values - 1));
+    return slot;
+}
+
+// Returns a pointer to element 'idx' of an array-form dmt stored in this->mp.
+// The paranoid checks expect array form, fixed-size values and an in-range index.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+dmtdata_t * dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::get_array_value(const uint32_t idx) const {
+    paranoid_invariant(this->is_array);
+    paranoid_invariant(this->values_same_size);
+    paranoid_invariant(idx < this->d.a.num_values);
+
+    dmtdata_t *const element = get_array_value_internal(&this->mp, idx);
+    return element;
+}
+
+// Returns a pointer to element 'idx' inside 'mempool', computed as the
+// mempool base plus idx * align(value_length). No bounds checking is done here.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+dmtdata_t * dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::get_array_value_internal(const struct mempool *mempool, const uint32_t idx) const {
+    const uint32_t byte_offset = idx * align(this->value_length);
+    void *const raw = toku_mempool_get_pointer_from_base_and_offset(mempool, byte_offset);
+    return static_cast<dmtdata_t *>(raw);
+}
+
+// Ensures the array-form mempool has room for one more aligned element.
+// If the free space is insufficient, a new mempool sized for
+// (n <= 2 ? 4 : 2*n) elements is constructed, where n = num_values + 1;
+// the existing elements are copied over and the old mempool is destroyed.
+//TODO(leif) write microbenchmarks to compare growth factor.  Note:  growth factor here is actually 2.5 because of mempool_construct
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::maybe_resize_array_for_insert(void) {
+    // Bytes occupied by one element once alignment padding is included.
+    const uint32_t slot_bytes = align(this->value_length);
+
+    if (toku_mempool_get_free_size(&this->mp) >= slot_bytes) {
+        // Enough room already.
+        return;
+    }
+
+    const uint32_t n = this->d.a.num_values + 1;
+    const uint32_t new_n = (n <= 2) ? 4 : 2*n;
+    const uint32_t new_space = slot_bytes * new_n;
+
+    struct mempool grown_mp;
+    toku_mempool_construct(&grown_mp, new_space);
+    size_t copy_bytes = this->d.a.num_values * slot_bytes;
+    invariant(copy_bytes + slot_bytes <= new_space);
+    paranoid_invariant(copy_bytes <= toku_mempool_get_used_size(&this->mp));
+
+    // Move the existing elements into the new mempool.
+    if (this->d.a.num_values > 0) {
+        void *const dest = toku_mempool_malloc(&grown_mp, copy_bytes);
+        invariant(dest!=nullptr);
+        memcpy(dest, get_array_value(0), copy_bytes);
+    }
+    toku_mempool_destroy(&this->mp);
+    this->mp = grown_mp;
+}
+
+// Returns roundup_to_multiple(ALIGNMENT, x), i.e. 'x' rounded with respect
+// to ALIGNMENT as defined by that helper.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+uint32_t dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::align(const uint32_t x) const {
+    const uint32_t rounded = roundup_to_multiple(ALIGNMENT, x);
+    return rounded;
+}
+
+// Makes sure the dmt is in array form, converting from tree form if needed.
+// NOTE(review): convert_from_tree_to_array() paranoid-asserts
+// values_same_size, so callers presumably only invoke this on fixed-size
+// dmts -- confirm against callers.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::prepare_for_serialize(void) {
+    if (this->is_array) {
+        return;
+    }
+    this->convert_from_tree_to_array();
+}
+
+// Converts a tree-form dmt with fixed-size values into array form.
+// The node offsets are gathered with fill_array_with_subtree_offsets(), each
+// node's value is copied into a new mempool at a stride of
+// align(value_length), and the old mempool is destroyed.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::convert_from_tree_to_array(void) {
+    paranoid_invariant(!this->is_array);
+    paranoid_invariant(this->values_same_size);
+
+    const uint32_t count = this->size();
+
+    // Scratch array for the node offsets; fall back to the heap when
+    // alloc_temp_node_offsets() returns null.
+    node_offset *offsets = alloc_temp_node_offsets(count);
+    const bool offsets_on_heap = (offsets == nullptr);
+    if (offsets_on_heap) {
+        XMALLOC_N(count, offsets);
+    }
+    this->fill_array_with_subtree_offsets(offsets, this->d.t.root);
+
+    const uint32_t packed_len = this->value_length;
+    const uint32_t padded_len = align(this->value_length);
+    size_t bytes_needed = count * padded_len;
+
+    struct mempool array_mp;
+    toku_mempool_construct(&array_mp, bytes_needed);
+    uint8_t *const dest = static_cast<uint8_t *>(toku_mempool_malloc(&array_mp, bytes_needed));
+    paranoid_invariant_notnull(dest);
+    for (uint32_t i = 0; i < count; i++) {
+        const dmt_node &node = get_node(offsets[i]);
+        memcpy(&dest[i*padded_len], &node.value, packed_len);
+    }
+
+    // Swap in the array mempool and switch representation.
+    toku_mempool_destroy(&this->mp);
+    this->mp = array_mp;
+    this->d.a.num_values = count;
+    this->is_array = true;
+
+    if (offsets_on_heap) {
+        toku_free(offsets);
+    }
+}
+
+// Converts an array-form dmt into tree form.
+// A new mempool sized for num_values nodes is constructed, every array
+// element is re-created as a tree node via node_malloc_and_set_value(), the
+// tree is built with rebuild_subtree_from_offsets(), and the old (array)
+// mempool is destroyed last.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::convert_from_array_to_tree(void) {
+    paranoid_invariant(this->is_array);
+    paranoid_invariant(this->values_same_size);
+
+    // Remember the element count; d.a is about to be replaced by d.t.
+    const uint32_t count = this->d.a.num_values;
+
+    // Scratch array for the new node offsets; fall back to the heap when
+    // alloc_temp_node_offsets() returns null.
+    node_offset *offsets = alloc_temp_node_offsets(count);
+    const bool offsets_on_heap = (offsets == nullptr);
+    if (offsets_on_heap) {
+        XMALLOC_N(count, offsets);
+    }
+
+    // Keep the array mempool around as the copy source while this->mp is rebuilt.
+    struct mempool array_mp = this->mp;
+    size_t bytes_needed = count * align(this->value_length + __builtin_offsetof(dmt_node, value));
+    toku_mempool_construct(&this->mp, bytes_needed);
+
+    for (uint32_t i = 0; i < count; i++) {
+        dmtwriter_t element_writer(this->value_length, get_array_value_internal(&array_mp, i));
+        offsets[i] = node_malloc_and_set_value(element_writer);
+    }
+    this->is_array = false;
+    this->rebuild_subtree_from_offsets(&this->d.t.root, offsets, count);
+
+    if (offsets_on_heap) {
+        toku_free(offsets);
+    }
+    toku_mempool_destroy(&array_mp);
+}
+
+// Removes the value at index 'idx'.
+// Returns EINVAL if idx >= size(), otherwise 0.
+// Deleting the only value simply clear()s the dmt; any other delete is done
+// in tree form (an array-form dmt is converted first).
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::delete_at(const uint32_t idx) {
+    const uint32_t count = this->size();
+    if (idx >= count) {
+        return EINVAL;
+    }
+
+    if (count == 1) {
+        // Emptying out the entire dmt.
+        this->clear();
+        return 0;
+    }
+
+    if (this->is_array) {
+        this->convert_from_array_to_tree();
+    }
+    paranoid_invariant(!is_array);
+
+    subtree *to_rebalance = nullptr;
+    this->delete_internal(&this->d.t.root, idx, nullptr, &to_rebalance);
+    if (to_rebalance != nullptr) {
+        this->rebalance(to_rebalance);
+    }
+    this->maybe_resize_tree(nullptr);
+    return 0;
+}
+
+// Applies 'f' to every value by delegating to iterate_on_range() over the
+// full index range [0, size()). Returns whatever iterate_on_range() returns.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename iterate_extra_t,
+         int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::iterate(iterate_extra_t *const iterate_extra) const {
+    const uint32_t end = this->size();
+    return this->iterate_on_range<iterate_extra_t, f>(0, end, iterate_extra);
+}
+
+// Applies 'f' to the values with indexes in [left, right).
+// Returns EINVAL if right > size(), 0 immediately if left == right, and
+// otherwise the result of the array or tree iteration helper.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename iterate_extra_t,
+         int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const {
+    if (right > this->size()) {
+        return EINVAL;
+    }
+    if (left == right) {
+        // Empty range: nothing to visit.
+        return 0;
+    }
+
+    int r;
+    if (this->is_array) {
+        r = this->iterate_internal_array<iterate_extra_t, f>(left, right, iterate_extra);
+    } else {
+        r = this->iterate_internal<iterate_extra_t, f>(left, right, this->d.t.root, 0, iterate_extra);
+    }
+    return r;
+}
+
+// Consistency check of the dmt against its mempool; aborts via invariant()
+// on any mismatch.
+// Array form: the used size must equal num_values * align(value_length) and
+// the mempool must report zero fragmentation.
+// Tree form: the used size is checked exactly (fixed-size values) or as a
+// lower bound (variable-size values), then verify_internal() marks every
+// byte owned by a node and the marked byte count must equal the used size.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::verify(void) const {
+    const uint32_t count = this->size();
+    invariant(count < UINT32_MAX);
+    const size_t pool_used = toku_mempool_get_used_size(&this->mp);
+    const size_t pool_size = toku_mempool_get_size(&this->mp);
+    const size_t pool_frag = toku_mempool_get_frag_size(&this->mp);
+    invariant(pool_used <= pool_size);
+
+    if (this->is_array) {
+        invariant(this->values_same_size);
+        invariant(count == this->d.a.num_values);
+
+        // We know exactly how much memory should be used.
+        invariant(pool_used == count * align(this->value_length));
+
+        // Array form must have 0 fragmentation in mempool.
+        invariant(pool_frag == 0);
+        return;
+    }
+
+    if (this->values_same_size) {
+        // We know exactly how much memory should be used.
+        invariant(pool_used == count * align(this->value_length + __builtin_offsetof(dmt_node, value)));
+    } else {
+        // We can only do a lower bound on memory usage.
+        invariant(pool_used >= count * __builtin_offsetof(dmt_node, value));
+    }
+
+    // One flag per mempool byte; verify_internal() sets the flags of every
+    // byte that belongs to a reachable node.
+    std::vector<bool> touched(pool_size, false);
+    verify_internal(this->d.t.root, &touched);
+    size_t bytes_used = 0;
+    for (const bool byte_is_touched : touched) {
+        if (byte_is_touched) {
+            ++bytes_used;
+        }
+    }
+    invariant(bytes_used == pool_used);
+}
+
+// Recursively verifies the subtree rooted at 'subtree':
+//  - with fixed-size values, each node's value_length matches the dmt's;
+//  - each node lies inside the mempool at an ALIGNMENT-aligned offset;
+//  - no mempool byte is claimed by more than one node ('touched' is updated);
+//  - each node's weight equals left weight + right weight + 1.
+// Aborts via invariant() on any violation.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::verify_internal(const subtree &subtree, std::vector<bool> *touched) const {
+    if (subtree.is_null()) {
+        return;
+    }
+    const dmt_node &node = get_node(subtree);
+
+    if (this->values_same_size) {
+        invariant(node.value_length == this->value_length);
+    }
+
+    // Byte range [begin, end) of the mempool occupied by this node.
+    const size_t begin = toku_mempool_get_offset_from_pointer_and_base(&this->mp, &node);
+    const size_t node_size = align(__builtin_offsetof(dmt_node, value) + node.value_length);
+    const size_t end = begin + node_size;
+    invariant(begin <= touched->size());
+    invariant(end <= touched->size());
+    invariant(begin % ALIGNMENT == 0);
+
+    // Mark memory as touched and never allocated to multiple nodes.
+    for (size_t byte = begin; byte < end; ++byte) {
+        invariant(!touched->at(byte));
+        touched->at(byte) = true;
+    }
+
+    const uint32_t leftweight = this->nweight(node.left);
+    const uint32_t rightweight = this->nweight(node.right);
+    invariant(leftweight + rightweight + 1 == this->nweight(subtree));
+
+    verify_internal(node.left, touched);
+    verify_internal(node.right, touched);
+}
+
+// Applies 'f' to a mutable pointer to every value, over the full index range
+// [0, size()), using the array or tree helper depending on the current form.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename iterate_extra_t,
+         int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::iterate_ptr(iterate_extra_t *const iterate_extra) {
+    const uint32_t end = this->size();
+    if (!this->is_array) {
+        this->iterate_ptr_internal<iterate_extra_t, f>(0, end, this->d.t.root, 0, iterate_extra);
+        return;
+    }
+    this->iterate_ptr_internal_array<iterate_extra_t, f>(0, end, iterate_extra);
+}
+
+// Retrieves the value at index 'idx' through the array or tree fetch helper,
+// passing 'value_len' and 'value' on as output arguments.
+// Returns EINVAL if idx >= size(), otherwise 0.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::fetch(const uint32_t idx, uint32_t *const value_len, dmtdataout_t *const value) const {
+    if (idx >= this->size()) {
+        return EINVAL;
+    }
+
+    if (!this->is_array) {
+        this->fetch_internal(this->d.t.root, idx, value_len, value);
+        return 0;
+    }
+    this->fetch_internal_array(idx, value_len, value);
+    return 0;
+}
+
+// Dispatches a "find zero" search (heaviside function 'h', argument 'extra')
+// to the array or tree implementation and returns its result.
+// 'idxp' may be null: a local dummy index is passed to the helper instead.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename dmtcmp_t,
+         int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::find_zero(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
+    uint32_t unused_index;
+    uint32_t *const index_out = (idxp == nullptr) ? &unused_index : idxp;
+
+    if (this->is_array) {
+        return this->find_internal_zero_array<dmtcmp_t, h>(extra, value_len, value, index_out);
+    }
+    return this->find_internal_zero<dmtcmp_t, h>(this->d.t.root, extra, value_len, value, index_out);
+}
+
+// Dispatches a directional search to one of four helpers, chosen by the sign
+// of 'direction' (negative -> "minus", positive -> "plus") and by the current
+// form (array or tree). Returns the helper's result.
+// 'direction' must be non-zero (paranoid-asserted). 'idxp' may be null: a
+// local dummy index is passed to the helper instead.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename dmtcmp_t,
+         int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::find(const dmtcmp_t &extra, int direction, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
+    uint32_t unused_index;
+    uint32_t *const index_out = (idxp == nullptr) ? &unused_index : idxp;
+    paranoid_invariant(direction != 0);
+
+    const bool search_minus = (direction < 0);
+    if (this->is_array) {
+        if (search_minus) {
+            return this->find_internal_minus_array<dmtcmp_t, h>(extra, value_len,  value, index_out);
+        }
+        return this->find_internal_plus_array<dmtcmp_t, h>(extra, value_len,  value, index_out);
+    }
+    if (search_minus) {
+        return this->find_internal_minus<dmtcmp_t, h>(this->d.t.root, extra, value_len,  value, index_out);
+    }
+    return this->find_internal_plus<dmtcmp_t, h>(this->d.t.root, extra, value_len, value, index_out);
+}
+
+// Returns sizeof(*this) plus the total size reported by the mempool
+// (toku_mempool_get_size), i.e. capacity rather than bytes in use.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+size_t dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::memory_size(void) {
+    const size_t object_bytes = sizeof(*this);
+    return object_bytes + toku_mempool_get_size(&this->mp);
+}
+
+// Returns the node that the (non-null) 'subtree' refers to, by resolving its
+// offset through the offset-based get_node() overload.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+dmt_node_templated<dmtdata_t> & dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::get_node(const subtree &subtree) const {
+    paranoid_invariant(!subtree.is_null());
+    const node_offset offset = subtree.get_offset();
+    return get_node(offset);
+}
+
+// Returns the node located 'offset' bytes from the base of this->mp.
+// No validation of 'offset' is performed here.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+dmt_node_templated<dmtdata_t> & dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::get_node(const node_offset offset) const {
+    void *const raw = toku_mempool_get_pointer_from_base_and_offset(&this->mp, offset);
+    return *static_cast<dmt_node *>(raw);
+}
+
+// Stores 'value' in node 'n': records its size in n->value_length, then has
+// the writer emit the value into n->value.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::node_set_value(dmt_node * n, const dmtwriter_t &value) {
+    dmt_node &node = *n;
+    node.value_length = value.get_size();
+    value.write_to(&node.value);
+}
+
+// Allocates a node from this->mp large enough for the node header plus
+// 'value' (rounded with align()), stores 'value' in it and returns the node's
+// offset from the mempool base. weight/left/right are NOT initialized here.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+node_offset dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::node_malloc_and_set_value(const dmtwriter_t &value) {
+    const size_t header_bytes = __builtin_offsetof(dmt_node, value);
+    const size_t payload_bytes = value.get_size();
+    const size_t node_bytes = align(header_bytes + payload_bytes);
+
+    void *const raw = toku_mempool_malloc(&this->mp, node_bytes);
+    paranoid_invariant_notnull(raw);
+    node_set_value(static_cast<dmt_node *>(raw), value);
+
+    return toku_mempool_get_offset_from_pointer_and_base(&this->mp, raw);
+}
+
+// Returns the node referenced by 'st' to the mempool. The size released is
+// the node header plus the node's value_length, rounded with align() -- the
+// same formula node_malloc_and_set_value() uses when allocating.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::node_free(const subtree &st) {
+    dmt_node &node = get_node(st);
+    const size_t header_bytes = __builtin_offsetof(dmt_node, value);
+    const size_t node_bytes = align(header_bytes + node.value_length);
+    toku_mempool_mfree(&this->mp, &node, node_bytes);
+}
+
+// Resizes the tree-form mempool when it is either much too large or too small
+// to hold one more node for 'value'.
+//
+// value: the value about to be inserted, or nullptr when no insertion is
+//        pending (e.g. after a delete); in that case no extra room is needed.
+//
+// The target size is 2 * (bytes in use + bytes for the new node). A resize
+// happens when the current capacity is at least twice that target, or when
+// the free space cannot hold the new node. Three strategies are used:
+//   - growing with zero fragmentation: realloc in place (offsets unchanged);
+//   - non-empty tree otherwise: copy every node into a new mempool and
+//     rebuild the tree from the new offsets;
+//   - empty tree: replace the mempool with a fresh one.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::maybe_resize_tree(const dmtwriter_t * value) {
+    const ssize_t curr_capacity = toku_mempool_get_size(&this->mp);
+    const ssize_t curr_free = toku_mempool_get_free_size(&this->mp);
+    const ssize_t curr_used = toku_mempool_get_used_size(&this->mp);
+    ssize_t add_size = 0;
+    if (value) {
+        add_size = __builtin_offsetof(dmt_node, value) + value->get_size();
+        add_size = align(add_size);
+    }
+
+    const ssize_t need_size = curr_used + add_size;
+    paranoid_invariant(need_size <= UINT32_MAX);
+    //TODO(leif) consider different growth rates
+    const ssize_t new_size = 2*need_size;
+    paranoid_invariant(new_size <= UINT32_MAX);
+
+    if ((curr_capacity / 2 >= new_size) || // Way too much allocated
+        (curr_free < add_size)) {  // No room in mempool
+        // Copy all memory and reconstruct dmt in new mempool.
+        if (curr_free < add_size && toku_mempool_get_frag_size(&this->mp) == 0) {
+            // TODO(yoni) or TODO(leif) consider doing this not just when frag size is zero, but also when it is a small percentage of the total mempool size
+            // Offsets remain the same in the new mempool so we can just realloc.
+            toku_mempool_realloc_larger(&this->mp, new_size);
+        } else if (!this->d.t.root.is_null()) {
+            struct mempool new_kvspace;
+            toku_mempool_construct(&new_kvspace, new_size);
+
+            // Snapshot the node count once. The root node lives in the old
+            // mempool, so we avoid reading through a reference into it after
+            // this->mp has been swapped below.
+            const uint32_t num_nodes = this->nweight(this->d.t.root);
+            node_offset *tmp_array;
+            bool malloced = false;
+            tmp_array = alloc_temp_node_offsets(num_nodes);
+            if (!tmp_array) {
+                malloced = true;
+                XMALLOC_N(num_nodes, tmp_array);
+            }
+            this->fill_array_with_subtree_offsets(tmp_array, this->d.t.root);
+            for (node_offset i = 0; i < num_nodes; i++) {
+                dmt_node &node = get_node(tmp_array[i]);
+                const size_t bytes_to_copy = __builtin_offsetof(dmt_node, value) + node.value_length;
+                const size_t bytes_to_alloc = align(bytes_to_copy);
+                void* newdata = toku_mempool_malloc(&new_kvspace, bytes_to_alloc);
+                // Same check as the other allocation sites in this file:
+                // never memcpy into a failed allocation.
+                paranoid_invariant_notnull(newdata);
+                memcpy(newdata, &node, bytes_to_copy);
+                // Replace the old offset with the node's offset in the new mempool.
+                tmp_array[i] = toku_mempool_get_offset_from_pointer_and_base(&new_kvspace, newdata);
+            }
+
+            // Swap mempools first so the rebuild operates on the copied nodes,
+            // then release the old storage.
+            struct mempool old_kvspace = this->mp;
+            this->mp = new_kvspace;
+            this->rebuild_subtree_from_offsets(&this->d.t.root, tmp_array, num_nodes);
+            if (malloced) toku_free(tmp_array);
+            toku_mempool_destroy(&old_kvspace);
+        } else {
+            // Empty tree: nothing to copy, just start over with a fresh mempool.
+            toku_mempool_destroy(&this->mp);
+            toku_mempool_construct(&this->mp, new_size);
+        }
+    }
+}
+
+// Reports whether the node at 'subtree' would be out of balance once its
+// left and right child weights are adjusted by 'leftmod' and 'rightmod'.
+// A null subtree never needs rebalancing.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+bool dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::will_need_rebalance(const subtree &subtree, const int leftmod, const int rightmod) const {
+    if (subtree.is_null()) {
+        return false;
+    }
+    const dmt_node &node = get_node(subtree);
+    const uint32_t weight_left  = this->nweight(node.left)  + leftmod;
+    const uint32_t weight_right = this->nweight(node.right) + rightmod;
+
+    // In each comparison, one of the 1's is for the root and
+    // the other is to take ceil(n/2).
+    const bool left_too_light  = (1+weight_left < (1+1+weight_right)/2);
+    const bool right_too_light = (1+weight_right < (1+1+weight_left)/2);
+    return left_too_light || right_too_light;
+}
+
+// Recursively inserts 'value' at index 'idx' of the subtree *subtreep.
+// Every node on the path has its weight incremented. The first node on the
+// path (from the top) for which will_need_rebalance() is true is reported
+// through *rebalance_subtree; the caller is expected to rebalance it.
+// At a null subtree a new leaf is allocated and linked in.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::insert_internal(subtree *const subtreep, const dmtwriter_t &value, const uint32_t idx, subtree **const rebalance_subtree) {
+    if (subtreep->is_null()) {
+        // Bottom of the recursion: create the leaf here.
+        paranoid_invariant_zero(idx);
+        const node_offset leaf_offset = this->node_malloc_and_set_value(value);
+        dmt_node &leaf = get_node(leaf_offset);
+        leaf.left.set_to_null();
+        leaf.right.set_to_null();
+        leaf.weight = 1;
+        subtreep->set_offset(leaf_offset);
+        return;
+    }
+
+    dmt_node &n = get_node(*subtreep);
+    n.weight++;
+
+    const uint32_t left_weight = this->nweight(n.left);
+    const bool go_left = (idx <= left_weight);
+
+    // Remember the topmost node that this insert will unbalance.
+    if (*rebalance_subtree == nullptr &&
+        this->will_need_rebalance(*subtreep, go_left ? 1 : 0, go_left ? 0 : 1)) {
+        *rebalance_subtree = subtreep;
+    }
+
+    if (go_left) {
+        this->insert_internal(&n.left, value, idx, rebalance_subtree);
+    } else {
+        // Skip the left subtree and this node when indexing into the right subtree.
+        const uint32_t sub_index = idx - left_weight - 1;
+        this->insert_internal(&n.right, value, sub_index, rebalance_subtree);
+    }
+}
+
+// Recursively removes the element at index 'idx' from the subtree *subtreep.
+//
+// subtreep:          link (in the parent, or the root) to the current node;
+//                    must be non-null and must not refer to a null subtree.
+// idx:               index of the element to remove, relative to this subtree.
+// subtree_replace:   nullptr for an ordinary delete. Non-null when an
+//                    ancestor with two children is the element being deleted:
+//                    the node found here (index 0 of the ancestor's right
+//                    subtree) is then NOT freed; instead it takes over the
+//                    ancestor's weight/left/right, the ancestor's link is
+//                    pointed at it, and the ancestor node is freed.
+// rebalance_subtree: out-parameter; receives the first link on the path (from
+//                    the top) for which will_need_rebalance() is true.
+//
+// Weights of the nodes on the path are decremented as the recursion descends.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::delete_internal(subtree *const subtreep, const uint32_t idx, subtree *const subtree_replace, subtree **const rebalance_subtree) {
+    paranoid_invariant_notnull(subtreep);
+    paranoid_invariant_notnull(rebalance_subtree);
+    paranoid_invariant(!subtreep->is_null());
+    dmt_node &n = get_node(*subtreep);
+    const uint32_t leftweight = this->nweight(n.left);
+    if (idx < leftweight) {
+        // Target is in the left subtree.
+        n.weight--;
+        if (*rebalance_subtree == nullptr && this->will_need_rebalance(*subtreep, -1, 0)) {
+            *rebalance_subtree = subtreep;
+        }
+        this->delete_internal(&n.left, idx, subtree_replace, rebalance_subtree);
+    } else if (idx == leftweight) {
+        // Found the correct index.
+        if (n.left.is_null()) {
+            paranoid_invariant_zero(idx);
+            // Delete n and let parent point to n.right
+            subtree ptr_this = *subtreep;
+            *subtreep = n.right;
+            subtree to_free;
+            if (subtree_replace != nullptr) {
+                // Swap self with the other node.  Taking over all responsibility.
+                to_free = *subtree_replace;
+                dmt_node &ancestor = get_node(*subtree_replace);
+                if (*rebalance_subtree == &ancestor.right) {
+                    // Take over rebalance responsibility.
+                    // (The recorded link lives inside the ancestor node, which
+                    // is about to be freed; n.right will hold the same link.)
+                    *rebalance_subtree = &n.right;
+                }
+                n.weight = ancestor.weight;
+                n.left = ancestor.left;
+                n.right = ancestor.right;
+                *subtree_replace = ptr_this;
+            } else {
+                to_free = ptr_this;
+            }
+            this->node_free(to_free);
+        } else if (n.right.is_null()) {
+            // Delete n and let parent point to n.left
+            subtree to_free = *subtreep;
+            *subtreep = n.left;
+            paranoid_invariant(idx>0);
+            paranoid_invariant_null(subtree_replace);  // To be recursive, we're looking for index 0.  n is index > 0 here.
+            this->node_free(to_free);
+        } else {
+            // n has two children: remove index 0 of the right subtree instead
+            // and let that node replace n (passed down as subtree_replace).
+            if (*rebalance_subtree == nullptr && this->will_need_rebalance(*subtreep, 0, -1)) {
+                *rebalance_subtree = subtreep;
+            }
+            // don't need to copy up value, it's only used by this
+            // next call, and when that gets to the bottom there
+            // won't be any more recursion
+            n.weight--;
+            this->delete_internal(&n.right, 0, subtreep, rebalance_subtree);
+        }
+    } else {
+        // Target is in the right subtree; re-base the index past the left
+        // subtree and this node.
+        n.weight--;
+        if (*rebalance_subtree == nullptr && this->will_need_rebalance(*subtreep, 0, -1)) {
+            *rebalance_subtree = subtreep;
+        }
+        this->delete_internal(&n.right, idx - leftweight - 1, subtree_replace, rebalance_subtree);
+    }
+}
+
+// Array-form iteration: calls 'f' on the elements with indexes in
+// [left, right), in increasing order, passing value_length as the size.
+// Stops at and returns the first non-zero result of 'f'; returns 0 otherwise.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename iterate_extra_t,
+         int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::iterate_internal_array(const uint32_t left, const uint32_t right,
+                                                         iterate_extra_t *const iterate_extra) const {
+    uint32_t i = left;
+    while (i < right) {
+        const int r = f(this->value_length, *get_array_value(i), i, iterate_extra);
+        if (r != 0) {
+            return r;
+        }
+        ++i;
+    }
+    return 0;
+}
+
+// Tree-form in-order traversal restricted to indexes in [left, right).
+// 'idx' is the index of the first element of 'subtree' within the whole dmt.
+// 'f' receives a mutable pointer to each visited value and must return 0
+// (checked with lazy_assert_zero).
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename iterate_extra_t,
+         int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::iterate_ptr_internal(const uint32_t left, const uint32_t right,
+                                                        const subtree &subtree, const uint32_t idx,
+                                                        iterate_extra_t *const iterate_extra) {
+    if (subtree.is_null()) {
+        return;
+    }
+
+    dmt_node &node = get_node(subtree);
+    // Index of this node within the whole dmt.
+    const uint32_t idx_root = idx + this->nweight(node.left);
+
+    // Left subtree holds indexes below idx_root.
+    if (left < idx_root) {
+        this->iterate_ptr_internal<iterate_extra_t, f>(left, right, node.left, idx, iterate_extra);
+    }
+    // This node itself.
+    if (left <= idx_root && idx_root < right) {
+        int r = f(node.value_length, &node.value, idx_root, iterate_extra);
+        lazy_assert_zero(r);
+    }
+    // Right subtree holds indexes above idx_root.
+    if (idx_root + 1 < right) {
+        this->iterate_ptr_internal<iterate_extra_t, f>(left, right, node.right, idx_root + 1, iterate_extra);
+    }
+}
+
+// Array-format counterpart of iterate_ptr_internal: passes f a pointer to each
+// stored value with index in [left, right), ascending.  Every call to f must
+// return 0 (checked with lazy_assert_zero).
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename iterate_extra_t,
+         int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::iterate_ptr_internal_array(const uint32_t left, const uint32_t right,
+                                                              iterate_extra_t *const iterate_extra) {
+    uint32_t pos = left;
+    while (pos < right) {
+        const int rc = f(this->value_length, get_array_value(pos), pos, iterate_extra);
+        lazy_assert_zero(rc);
+        ++pos;
+    }
+}
+
+// Tree-format in-order walk over the values of `subtree` whose index lies in
+// [left, right).  idx is the index of the first element covered by `subtree`.
+// Returns the first nonzero value produced by f (stopping the walk there), or
+// 0 if every visited element yielded 0.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename iterate_extra_t,
+         int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::iterate_internal(const uint32_t left, const uint32_t right,
+                                                   const subtree &subtree, const uint32_t idx,
+                                                   iterate_extra_t *const iterate_extra) const {
+    if (subtree.is_null()) {
+        return 0;
+    }
+    const dmt_node &node = get_node(subtree);
+    const uint32_t root_idx = idx + this->nweight(node.left);
+    int rc = 0;
+    if (left < root_idx) {
+        rc = this->iterate_internal<iterate_extra_t, f>(left, right, node.left, idx, iterate_extra);
+    }
+    if (rc == 0 && left <= root_idx && root_idx < right) {
+        rc = f(node.value_length, node.value, root_idx, iterate_extra);
+    }
+    if (rc == 0 && root_idx + 1 < right) {
+        rc = this->iterate_internal<iterate_extra_t, f>(left, right, node.right, root_idx + 1, iterate_extra);
+    }
+    return rc;
+}
+
+// Array-format fetch: reports the (fixed) value length and the value stored at
+// index i through the optional out-parameters, via copyout.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::fetch_internal_array(const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const {
+    auto *const stored = get_array_value(i);
+    copyout(value_len, value, this->value_length, stored);
+}
+
+// Tree-format fetch: descends from `subtree` to the node holding the element
+// at (subtree-relative) index i and reports it through copyout.
+// The subtree is dereferenced unconditionally, so it must not be null.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::fetch_internal(const subtree &subtree, const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const {
+    dmt_node &node = get_node(subtree);
+    const uint32_t left_count = this->nweight(node.left);
+    if (i == left_count) {
+        // This node is the i-th element of the subtree.
+        copyout(value_len, value, &node);
+        return;
+    }
+    if (i < left_count) {
+        this->fetch_internal(node.left, i, value_len, value);
+        return;
+    }
+    // Skip the left subtree and this node when indexing into the right side.
+    this->fetch_internal(node.right, i - left_count - 1, value_len, value);
+}
+
+// Writes the node offsets of `subtree` into `array` in in-order sequence:
+// left subtree first, then this node, then the right subtree.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::fill_array_with_subtree_offsets(node_offset *const array, const subtree &subtree) const {
+    if (subtree.is_null()) {
+        return;
+    }
+    const dmt_node &node = get_node(subtree);
+    const uint32_t left_count = this->nweight(node.left);
+    this->fill_array_with_subtree_offsets(array, node.left);
+    array[left_count] = subtree.get_offset();
+    this->fill_array_with_subtree_offsets(array + left_count + 1, node.right);
+}
+
+// Relinks the nodes named by offsets[0..numvalues) (given in sorted order)
+// into a tree rooted at *subtree.  The middle offset becomes the root, its
+// weight is set to numvalues, and both halves are rebuilt recursively.
+// Stored values are left untouched; only weights and child links are written.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::rebuild_subtree_from_offsets(subtree *const subtree, const node_offset *const offsets, const uint32_t numvalues) {
+    if (numvalues == 0) {
+        subtree->set_to_null();
+        return;
+    }
+    const uint32_t mid = numvalues / 2;
+    const node_offset root_offset = offsets[mid];
+    subtree->set_offset(root_offset);
+    dmt_node &root = get_node(root_offset);
+    root.weight = numvalues;
+    // The value is already stored in the node.
+    this->rebuild_subtree_from_offsets(&root.left, offsets, mid);
+    this->rebuild_subtree_from_offsets(&root.right, offsets + mid + 1, numvalues - mid - 1);
+}
+
+//TODO(leif): Note that this can mess with our memory_footprint calculation (we may touch past what is marked as 'used' in the mempool)
+// Returns the mempool's next-free pointer for use as scratch space when the
+// pool's free size can hold num_offsets node_offset entries; returns nullptr
+// when it cannot, in which case the caller must obtain memory elsewhere.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+node_offset* dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::alloc_temp_node_offsets(uint32_t num_offsets) {
+    const size_t bytes_wanted = num_offsets * sizeof(node_offset);
+    const size_t bytes_available = toku_mempool_get_free_size(&this->mp);
+    node_offset* CAST_FROM_VOIDP(scratch, toku_mempool_get_next_free_ptr(&this->mp));
+    return (bytes_available >= bytes_wanted) ? scratch : nullptr;
+}
+
+// Rebuilds the (non-null) subtree rooted at *subtree: collects its node
+// offsets in order into a scratch array, then relinks them around the middle
+// element.  Scratch space comes from alloc_temp_node_offsets when available,
+// otherwise from the heap.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::rebalance(subtree *const subtree) {
+    paranoid_invariant(!subtree->is_null());
+
+    // There is a possible "optimization" here:
+    //   if (this->values_same_size && subtree == &this->d.t.root) {
+    //       this->convert_from_tree_to_array();
+    //       return;
+    //   }
+    // but we don't want to do it because it involves actually copying values around
+    // as opposed to stopping in the middle of rebalancing (like in the OMT)
+
+    const node_offset root_offset = subtree->get_offset();
+    // Read the weight once, before relinking rewrites node weights.
+    const uint32_t num_nodes = get_node(root_offset).weight;
+    node_offset *offsets = alloc_temp_node_offsets(num_nodes);
+    const bool heap_allocated = (offsets == nullptr);
+    if (heap_allocated) {
+        XMALLOC_N(num_nodes, offsets);
+    }
+    this->fill_array_with_subtree_offsets(offsets, *subtree);
+    this->rebuild_subtree_from_offsets(subtree, offsets, num_nodes);
+    if (heap_allocated) {
+        toku_free(offsets);
+    }
+}
+
+// copyout (node, by value): stores the node's value length in *outlen and a
+// copy of the node's value in *out.  Either output pointer may be null, in
+// which case that output is skipped.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::copyout(uint32_t *const outlen, dmtdata_t *const out, const dmt_node *const n) {
+    if (outlen != nullptr) {
+        *outlen = n->value_length;
+    }
+    if (out != nullptr) {
+        *out = n->value;
+    }
+}
+
+// copyout (node, by pointer): stores the node's value length in *outlen and
+// the address of the value held inside the node in *out.  Either output
+// pointer may be null, in which case that output is skipped.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::copyout(uint32_t *const outlen, dmtdata_t **const out, dmt_node *const n) {
+    if (outlen != nullptr) {
+        *outlen = n->value_length;
+    }
+    if (out != nullptr) {
+        *out = &n->value;
+    }
+}
+
+// copyout (raw value, by value): stores len in *outlen and a copy of
+// *stored_value_ptr in *out.  Either output pointer may be null, in which
+// case that output is skipped.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::copyout(uint32_t *const outlen, dmtdata_t *const out, const uint32_t len, const dmtdata_t *const stored_value_ptr) {
+    if (outlen != nullptr) {
+        *outlen = len;
+    }
+    if (out != nullptr) {
+        *out = *stored_value_ptr;
+    }
+}
+
+// copyout (raw value, by pointer): stores len in *outlen and the pointer
+// stored_value_ptr itself in *out.  Either output pointer may be null, in
+// which case that output is skipped.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::copyout(uint32_t *const outlen, dmtdata_t **const out, const uint32_t len, dmtdata_t *const stored_value_ptr) {
+    if (outlen != nullptr) {
+        *outlen = len;
+    }
+    if (out != nullptr) {
+        *out = stored_value_ptr;
+    }
+}
+
+// Array-format binary search for the smallest index at which h returns 0.
+// On success returns 0, reports the value through copyout and stores the
+// index in *idxp.  Otherwise returns DB_NOTFOUND and sets *idxp to the
+// smallest index at which h was positive, or to the number of values if h
+// was never positive.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename dmtcmp_t,
+         int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::find_internal_zero_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
+    paranoid_invariant_notnull(idxp);
+    uint32_t lo = 0;
+    uint32_t hi = this->d.a.num_values;
+    uint32_t first_positive = subtree::NODE_NULL;
+    uint32_t first_zero = subtree::NODE_NULL;
+
+    while (lo != hi) {
+        const uint32_t mid = (lo + hi) / 2;
+        const int hv = h(this->value_length, *get_array_value(mid), extra);
+        if (hv < 0) {
+            lo = mid + 1;
+            continue;
+        }
+        // hv >= 0: remember the candidate and keep searching to its left.
+        if (hv > 0) {
+            first_positive = mid;
+        } else {
+            first_zero = mid;
+        }
+        hi = mid;
+    }
+    if (first_zero != subtree::NODE_NULL) {
+        //Found a zero
+        copyout(value_len, value, this->value_length, get_array_value(first_zero));
+        *idxp = first_zero;
+        return 0;
+    }
+    if (first_positive != subtree::NODE_NULL) {
+        *idxp = first_positive;
+    } else {
+        *idxp = this->d.a.num_values;
+    }
+    return DB_NOTFOUND;
+}
+
+// Tree-format search for the smallest (subtree-relative) index at which h
+// returns 0.  Returns 0 and reports the value through copyout on success.
+// On DB_NOTFOUND, *idxp is still written: 0 for a null subtree, adjusted on
+// the way back up by the sizes of the subtrees skipped to the left.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename dmtcmp_t,
+         int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::find_internal_zero(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
+    paranoid_invariant_notnull(idxp);
+    if (subtree.is_null()) {
+        *idxp = 0;
+        return DB_NOTFOUND;
+    }
+    dmt_node &node = get_node(subtree);
+    const int hv = h(node.value_length, node.value, extra);
+    if (hv > 0) {
+        // Any zero must be strictly to the left.
+        return this->find_internal_zero<dmtcmp_t, h>(node.left, extra, value_len, value, idxp);
+    }
+    if (hv < 0) {
+        // Any zero must be to the right; shift the reported index past the
+        // left subtree and this node, whether or not a zero was found.
+        const int right_result = this->find_internal_zero<dmtcmp_t, h>(node.right, extra, value_len, value, idxp);
+        *idxp += this->nweight(node.left) + 1;
+        return right_result;
+    }
+    // This node is a zero; prefer an earlier zero in the left subtree.
+    const int left_result = this->find_internal_zero<dmtcmp_t, h>(node.left, extra, value_len, value, idxp);
+    if (left_result != DB_NOTFOUND) {
+        return left_result;
+    }
+    *idxp = this->nweight(node.left);
+    copyout(value_len, value, &node);
+    return 0;
+}
+
+// Array-format binary search for the smallest index at which h is positive.
+// Returns 0, reports the value through copyout and stores the index in *idxp
+// on success; returns DB_NOTFOUND (leaving *idxp unwritten) otherwise.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename dmtcmp_t,
+         int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::find_internal_plus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
+    paranoid_invariant_notnull(idxp);
+    uint32_t lo = 0;
+    uint32_t hi = this->d.a.num_values;
+    uint32_t candidate = subtree::NODE_NULL;
+
+    while (lo != hi) {
+        const uint32_t mid = (lo + hi) / 2;
+        const int hv = h(this->value_length, *get_array_value(mid), extra);
+        if (hv <= 0) {
+            lo = mid + 1;
+            continue;
+        }
+        candidate = mid;
+        hi = mid;
+    }
+    if (candidate == subtree::NODE_NULL) {
+        return DB_NOTFOUND;
+    }
+    copyout(value_len, value, this->value_length, get_array_value(candidate));
+    *idxp = candidate;
+    return 0;
+}
+
+// Tree-format search for the smallest (subtree-relative) index at which h is
+// positive.  Returns 0 with the value reported through copyout and the index
+// in *idxp, or DB_NOTFOUND if no element of the subtree qualifies.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename dmtcmp_t,
+         int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::find_internal_plus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
+    paranoid_invariant_notnull(idxp);
+    if (subtree.is_null()) {
+        return DB_NOTFOUND;
+    }
+    dmt_node &node = get_node(subtree);
+    const int hv = h(node.value_length, node.value, extra);
+    if (hv <= 0) {
+        // The answer, if any, lies to the right; translate its index back
+        // into this subtree's numbering.
+        const int right_result = this->find_internal_plus<dmtcmp_t, h>(node.right, extra, value_len, value, idxp);
+        if (right_result == 0) {
+            *idxp += this->nweight(node.left) + 1;
+        }
+        return right_result;
+    }
+    // This node qualifies; an earlier match in the left subtree wins.
+    int left_result = this->find_internal_plus<dmtcmp_t, h>(node.left, extra, value_len, value, idxp);
+    if (left_result == DB_NOTFOUND) {
+        *idxp = this->nweight(node.left);
+        copyout(value_len, value, &node);
+        left_result = 0;
+    }
+    return left_result;
+}
+
+// Array-format binary search for the largest index at which h is negative.
+// Returns 0, reports the value through copyout and stores the index in *idxp
+// on success; returns DB_NOTFOUND (leaving *idxp unwritten) otherwise.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename dmtcmp_t,
+         int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::find_internal_minus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
+    paranoid_invariant_notnull(idxp);
+    uint32_t lo = 0;
+    uint32_t hi = this->d.a.num_values;
+    uint32_t candidate = subtree::NODE_NULL;
+
+    while (lo != hi) {
+        const uint32_t mid = (lo + hi) / 2;
+        const int hv = h(this->value_length, *get_array_value(mid), extra);
+        if (hv >= 0) {
+            hi = mid;
+            continue;
+        }
+        candidate = mid;
+        lo = mid + 1;
+    }
+    if (candidate == subtree::NODE_NULL) {
+        return DB_NOTFOUND;
+    }
+    copyout(value_len, value, this->value_length, get_array_value(candidate));
+    *idxp = candidate;
+    return 0;
+}
+
+// Tree-format search for the largest (subtree-relative) index at which h is
+// negative.  Returns 0 with the value reported through copyout and the index
+// in *idxp, or DB_NOTFOUND if no element of the subtree qualifies.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+template<typename dmtcmp_t,
+         int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+int dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::find_internal_minus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
+    paranoid_invariant_notnull(idxp);
+    if (subtree.is_null()) {
+        return DB_NOTFOUND;
+    }
+    dmt_node &node = get_node(subtree);
+    const int hv = h(node.value_length, node.value, extra);
+    if (hv >= 0) {
+        // Neither this node nor anything to its right can qualify.
+        return this->find_internal_minus<dmtcmp_t, h>(node.left, extra, value_len, value, idxp);
+    }
+    // This node qualifies; a later match in the right subtree wins.
+    int result = this->find_internal_minus<dmtcmp_t, h>(node.right, extra, value_len, value, idxp);
+    if (result == DB_NOTFOUND) {
+        *idxp = this->nweight(node.left);
+        copyout(value_len, value, &node);
+        result = 0;
+    } else if (result == 0) {
+        *idxp += this->nweight(node.left) + 1;
+    }
+    return result;
+}
+
+// Returns the common value length while all values have the same size,
+// and 0 otherwise.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+uint32_t dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::get_fixed_length(void) const {
+    if (!this->values_same_size) {
+        return 0;
+    }
+    return this->value_length;
+}
+
+// Returns the number of padding bytes align() adds to one fixed-length value,
+// or 0 when the values do not all have the same size.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+uint32_t dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::get_fixed_length_alignment_overhead(void) const {
+    if (!this->values_same_size) {
+        return 0;
+    }
+    return align(this->value_length) - this->value_length;
+}
+
+// Reports whether this dmt is currently tracking all of its values as having
+// the same size.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+bool dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::value_length_is_fixed(void) const {
+    const bool fixed = this->values_same_size;
+    return fixed;
+}
+
+// Writes every value of an array-format, fixed-length dmt into wb in packed
+// form (alignment padding between stored values is dropped).
+// expected_unpadded_memory is the number of bytes written and must equal
+// num_values * value_length.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::serialize_values(uint32_t expected_unpadded_memory, struct wbuf *wb) const {
+    invariant(this->is_array);
+    invariant(this->values_same_size);
+    const uint32_t unpadded_len = this->value_length;
+    const uint32_t padded_len = align(this->value_length);
+    const uint8_t pad_bytes = get_fixed_length_alignment_overhead();
+    paranoid_invariant(expected_unpadded_memory == this->d.a.num_values * this->value_length);
+    paranoid_invariant(toku_mempool_get_used_size(&this->mp) >=
+                       expected_unpadded_memory + pad_bytes * this->d.a.num_values);
+    if (this->d.a.num_values == 0) {
+        // Nothing to serialize
+        return;
+    }
+    if (pad_bytes == 0) {
+        // Stored layout is already packed: basically a memcpy
+        wbuf_nocrc_literal_bytes(wb, get_array_value(0), expected_unpadded_memory);
+        return;
+    }
+    // Stored values are padded: copy them one at a time, skipping the padding.
+    uint8_t* const packed = wbuf_nocrc_reserve_literal_bytes(wb, expected_unpadded_memory);
+    const uint8_t* const padded = reinterpret_cast<uint8_t*>(get_array_value(0));
+    //TODO(leif) maybe look at vectorization here
+    for (uint32_t i = 0; i < this->d.a.num_values; i++) {
+        memcpy(&packed[i*unpadded_len], &padded[i*padded_len], unpadded_len);
+    }
+}
+
+// Prepares the builder to accept up to _max_values values occupying up to
+// _max_value_bytes bytes (not counting alignment padding).  Creates the
+// temporary dmt and gives it a mempool sized for the values plus worst-case
+// alignment padding.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::builder::create(uint32_t _max_values, uint32_t _max_value_bytes) {
+    this->max_values = _max_values;
+    this->max_value_bytes = _max_value_bytes;
+    this->temp.create();
+    paranoid_invariant_null(toku_mempool_get_base(&this->temp.mp));
+    this->temp_valid = true;
+    this->sorted_node_offsets = nullptr;
+    // Include enough space for alignment padding.
+    // Widen to size_t before multiplying: both inputs are 32-bit, so the
+    // product/sum could otherwise wrap around in 32-bit arithmetic and yield
+    // an undersized pool.
+    const size_t initial_space =
+        static_cast<size_t>(ALIGNMENT - 1) * _max_values + _max_value_bytes;
+
+    toku_mempool_construct(&this->temp.mp, initial_space);  // Adds 25%
+}
+
+// Appends value to the dmt under construction.
+// While every appended value has the same size, values go to the end of the
+// array-format temp dmt.  The first value of a different size converts temp
+// to tree-format nodes (without weights and linkage, which build() fills in);
+// from then on each value becomes a node whose offset is recorded, in append
+// order, in sorted_node_offsets.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::builder::append(const dmtwriter_t &value) {
+    paranoid_invariant(this->temp_valid);
+    //NOTE: Always use d.a.num_values for size because we have not yet created root.
+    if (this->temp.values_same_size && (this->temp.d.a.num_values == 0 || value.get_size() == this->temp.value_length)) {
+        temp.insert_at_array_end<false>(value);
+        return;
+    }
+    // sorted_node_offsets holds exactly max_values entries.  Both the
+    // conversion loop below and the final store index it by the current
+    // value count, so refuse to go past the declared capacity rather than
+    // write beyond the allocation (build() would only notice afterwards).
+    invariant(this->temp.d.a.num_values < this->max_values);
+    if (this->temp.is_array) {
+        // Convert to tree format (without weights and linkage)
+        XMALLOC_N(this->max_values, this->sorted_node_offsets);
+
+        // Include enough space for alignment padding
+        size_t mem_needed = (ALIGNMENT - 1 + __builtin_offsetof(dmt_node, value)) * max_values + max_value_bytes;
+        struct mempool old_mp = this->temp.mp;
+
+        const uint32_t num_values = this->temp.d.a.num_values;
+        toku_mempool_construct(&this->temp.mp, mem_needed);
+
+        // Copy over and get node_offsets
+        for (uint32_t i = 0; i < num_values; i++) {
+            dmtwriter_t writer(this->temp.value_length, this->temp.get_array_value_internal(&old_mp, i));
+            this->sorted_node_offsets[i] = this->temp.node_malloc_and_set_value(writer);
+        }
+        this->temp.is_array = false;
+        this->temp.values_same_size = false;
+        this->temp.value_length = 0;
+        toku_mempool_destroy(&old_mp);
+    }
+    paranoid_invariant(!this->temp.is_array);
+    this->sorted_node_offsets[this->temp.d.a.num_values++] = this->temp.node_malloc_and_set_value(value);
+}
+
+// Reports whether every value appended to this (still valid) builder so far
+// has had the same size.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+bool dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::builder::value_length_is_fixed(void) {
+    paranoid_invariant(this->temp_valid);
+    const bool fixed = this->temp.values_same_size;
+    return fixed;
+}
+
+// Finishes construction and hands the result to *dest.
+// For a tree-format temp dmt, links the recorded nodes into a tree and frees
+// the offsets array.  If the mempool is much larger than what was actually
+// used, the contents are moved into a smaller pool first.  The builder is
+// marked invalid afterwards.
+template<typename dmtdata_t, typename dmtdataout_t, typename dmtwriter_t>
+void dmt<dmtdata_t, dmtdataout_t, dmtwriter_t>::builder::build(dmt<dmtdata_t, dmtdataout_t, dmtwriter_t> *dest) {
+    invariant(this->temp_valid);
+    //NOTE: Always use d.a.num_values for size because we have not yet created root.
+    invariant(this->temp.d.a.num_values <= this->max_values);
+    // Memory invariant is taken care of incrementally (during append())
+
+    if (!this->temp.is_array) {
+        invariant_notnull(this->sorted_node_offsets);
+        this->temp.rebuild_subtree_from_offsets(&this->temp.d.t.root, this->sorted_node_offsets, this->temp.d.a.num_values);
+        toku_free(this->sorted_node_offsets);
+        this->sorted_node_offsets = nullptr;
+    }
+    paranoid_invariant_null(this->sorted_node_offsets);
+
+    const size_t bytes_used = toku_mempool_get_used_size(&this->temp.mp);
+    const size_t bytes_allocated = toku_mempool_get_size(&this->temp.mp);
+    // Target: no more than (about) the space actually used plus 25% overhead for mempool growth.
+    // When the elements are known to be fixed-length, the better dmt constructor is used instead.
+    // In practice, as of Jan 2014, the builder is used in two cases:
+    //  - When we know the elements are not fixed-length.
+    //  - During upgrade of a pre version 26 basement node.
+    // During upgrade we will probably wildly overallocate, because values that are not stored in the dmt
+    // are not accounted for, so here we want to shrink the mempool.
+    // When the elements are not fixed-length, their total size is still known modulo alignment, so we
+    // allow for mempool overhead and worst-case alignment overhead, and do not shrink the mempool.
+    const size_t budget = bytes_used + (ALIGNMENT-1) * this->temp.size();
+    const size_t budget_with_mempool_overhead = budget + budget / 4;
+    //TODO(leif): get footprint calculation correct (under jemalloc) and add some form of footprint constraint
+    if (bytes_allocated > budget_with_mempool_overhead) {
+        // Reallocate smaller mempool to save memory
+        invariant_zero(toku_mempool_get_frag_size(&this->temp.mp));
+        struct mempool compact_mp;
+        toku_mempool_construct(&compact_mp, bytes_used);
+        void *const compact_base = toku_mempool_malloc(&compact_mp, bytes_used);
+        invariant_notnull(compact_base);
+        memcpy(compact_base, toku_mempool_get_base(&this->temp.mp), bytes_used);
+        toku_mempool_destroy(&this->temp.mp);
+        this->temp.mp = compact_mp;
+    }
+
+    *dest = this->temp;
+    this->temp_valid = false;
+
+}
+} // namespace toku
diff --git a/storage/tokudb/PerconaFT/util/dmt.h b/storage/tokudb/PerconaFT/util/dmt.h
new file mode 100644
index 00000000..99be296d
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/dmt.h
@@ -0,0 +1,675 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <vector>
+
+#include "portability/memory.h"
+#include "portability/toku_portability.h"
+#include "portability/toku_race_tools.h"
+#include "portability/toku_stdint.h"
+
+#include "ft/serialize/wbuf.h"
+#include "util/growable_array.h"
+#include "util/mempool.h"
+
+namespace toku {
+typedef uint32_t node_offset;
+
+
+/**
+ * Dynamic Order Maintenance Tree (DMT)
+ *
+ * Maintains a collection of totally ordered values, where each value has weight 1.
+ * A DMT supports variable sized values.
+ * The DMT is a mutable datatype.
+ *
+ * The Abstraction:
+ *
+ * A DMT is a vector of values, $V$, where $|V|$ is the length of the vector.
+ * The vector is numbered from $0$ to $|V|-1$.
+ *
+ * We can create a new DMT, which is the empty vector.
+ *
+ * We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
+ *  $|V'|=1+|V|$       and
+ *
+ *   V'_j = V_j       if $j<i$
+ *          x         if $j=i$
+ *          V_{j-1}   if $j>i$.
+ *
+ * We can specify $i$ using a kind of function instead of as an integer.
+ * Let $b$ be a function mapping from values to nonzero integers, such that
+ * the signum of $b$ is monotonically increasing.
+ * We can specify $i$ as the minimum integer such that $b(V_i)>0$.
+ *
+ * We look up a value using its index, or using a Heaviside function.
+ * For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing.
+ * When looking up values, we can look up
+ *  $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$.   (With a special return code if no such value exists.)
+ *      (Rationale:  Ordinarily we want $i$ to be unique.  But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.)
+ *  $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$.   (Or an indication that no such value exists.)
+ *  $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$.   (Or an indication that no such value exists.)
+ *
+ * When looking up a value using a Heaviside function, we get the value and its index.
+ *
+ * Performance:
+ *  Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function.
+ *  The memory required is O(|V|).
+ *
+ * Usage:
+ *  The dmt is templated by three parameters:
+ *   - dmtdata_t is what will be stored within the dmt.  These could be pointers or real data types (ints, structs).
+ *   - dmtdataout_t is what will be returned by find and related functions.  By default, it is the same as dmtdata_t, but you can set it to (dmtdata_t *).
+ *   - dmtwriter_t is a class that effectively handles (de)serialization between the value stored in the dmt and outside the dmt.
+ *  To create a dmt which will store "TXNID"s, for example, it is a good idea to typedef the template:
+ *   typedef dmt<TXNID, TXNID, txnid_writer_t> txnid_dmt_t;
+ *  If you are storing structs (or you want to edit what is stored), you may want to be able to get a pointer to the data actually stored in the dmt (see find_zero).  To do this, use the second template parameter:
+ *   typedef dmt<struct foo, struct foo *, foo_writer_t> foo_dmt_t;
+ */
+
+namespace dmt_internal {
+
+// Reference to a (possibly absent) node, stored as a 32-bit offset.
+// The reserved offset NODE_NULL marks "no node".
+class subtree {
+private:
+    uint32_t m_index;
+public:
+    // The maximum mempool size for a dmt is 2**32-2
+    static const uint32_t NODE_NULL = UINT32_MAX;
+
+    // Returns the stored offset (NODE_NULL when this subtree is null).
+    inline node_offset get_offset(void) const {
+        return m_index;
+    }
+
+    // True when this subtree refers to no node.
+    inline bool is_null(void) const {
+        return this->get_offset() == NODE_NULL;
+    }
+
+    // Makes this subtree refer to no node.
+    inline void set_to_null(void) {
+        m_index = NODE_NULL;
+    }
+
+    // Points this subtree at the node at `index`, which must not be NODE_NULL.
+    inline void set_offset(node_offset index) {
+        paranoid_invariant(index != NODE_NULL);
+        m_index = index;
+    }
+} __attribute__((__packed__,__aligned__(4)));
+
+template<typename dmtdata_t>
+class dmt_node_templated {  // One tree-format node: weight, child links, and the stored value with its length.
+public:
+    uint32_t weight;  // number of values in the subtree rooted at this node (this node included)
+    subtree left;  // left child; null when absent
+    subtree right;  // right child; null when absent
+    uint32_t value_length;  // length reported for `value` to callbacks and copyout
+    dmtdata_t value;  // the stored value; must remain the last member (node size is computed from its offset)
+} __attribute__((__aligned__(4)));  //NOTE: we cannot use attribute packed or dmtdata_t will call copy constructors (dmtdata_t might not be packed by default)
+
+}
+
+using namespace toku::dmt_internal;
+
+// Each data type used in a dmt requires a dmt_writer class (allows you to insert/etc with dynamic sized types).
+// A dmt_writer can be thought of as a (de)serializer
+// There is no default implementation.
+// A dmtwriter instance handles reading/writing 'dmtdata_t's to/from the dmt.
+// The class must implement the following functions:
+//      The size required in a dmt for the dmtdata_t represented:
+//          size_t get_size(void) const;
+//      Write the dmtdata_t to memory owned by a dmt:
+//          void write_to(dmtdata_t *const dest) const;
+//      Constructor (others are allowed, but this one is required)
+//          dmtwriter(const uint32_t dmtdata_t_len, dmtdata_t *const src)
+
+template<typename dmtdata_t,
+         typename dmtdataout_t,
+         typename dmtwriter_t
+        >
+class dmt {
+private:
+    typedef dmt_node_templated<dmtdata_t> dmt_node;
+
+public:
+    static const uint8_t ALIGNMENT = 4;
+
+    class builder {
+    public:
+        void append(const dmtwriter_t &value);
+
+        // Create a dmt builder to build a dmt that will have at most n_values values and use
+        // at most n_value_bytes bytes in the mempool to store values (not counting node or alignment overhead).
+        void create(uint32_t n_values, uint32_t n_value_bytes);
+
+        bool value_length_is_fixed(void);
+
+        // Constructs a dmt that contains everything that was append()ed to this builder.
+        // Destroys this builder and frees associated memory.
+        void build(dmt<dmtdata_t, dmtdataout_t, dmtwriter_t> *dest);
+    private:
+        uint32_t max_values;
+        uint32_t max_value_bytes;
+        node_offset *sorted_node_offsets;
+        bool temp_valid;
+        dmt<dmtdata_t, dmtdataout_t, dmtwriter_t> temp;
+    };
+
+    /**
+     * Effect: Create an empty DMT.
+     * Performance: constant time.
+     */
+    void create(void);
+
+    /**
+     * Effect: Create a DMT containing values.  The number of values is in numvalues.
+     *         Each value is of a fixed (at runtime) length.
+     *         mem contains the values in packed form (no alignment padding)
+     *         Caller retains ownership of mem.
+     * Requires: this has not been created yet
+     * Rationale:    Normally to insert N values takes O(N lg N) amortized time.
+     *               If the N values are known in advance, are sorted, and
+     *               the structure is empty, we can batch insert them much faster.
+     */
+    __attribute__((nonnull))
+    void create_from_sorted_memory_of_fixed_size_elements(
+            const void *mem,
+            const uint32_t numvalues,
+            const uint32_t mem_length,
+            const uint32_t fixed_value_length);
+
+    /**
+     * Effect: Creates a copy of a dmt.
+     *  Creates this as the clone.
+     *  Each element is copied directly.  If they are pointers, the underlying data is not duplicated.
+     * Performance: O(memory) (essentially a memdup)
+     *  The underlying structures are memcpy'd.  Only the values themselves are copied (shallow copy)
+     */
+    void clone(const dmt &src);
+
+    /**
+     * Effect: Set the tree to be empty.
+     *  Note: Will not reallocate or resize any memory.
+     *  Note: If this dmt had variable sized elements, it will start tracking again (until it gets values of two different sizes)
+     * Performance: time=O(1)
+     */
+    void clear(void);
+
+    /**
+     * Effect:  Destroy a DMT, freeing all its memory.
+     *   If the values being stored are pointers, their underlying data is not freed.
+     *   Those values may be freed before or after calling ::destroy()
+     * Rationale: Returns no values since free() cannot fail.
+     * Rationale: Does not free the underlying pointers to reduce complexity/maintain abstraction layer
+     * Performance:  time=O(1)
+     */
+    void destroy(void);
+
+    /**
+     * Effect: return |this| (number of values stored in this dmt).
+     * Performance:  time=O(1)
+     */
+    uint32_t size(void) const;
+
+    /**
+     * Effect: Serialize all values contained in this dmt into a packed form (no alignment padding).
+     *  We serialize to wb.  expected_unpadded_memory is the size of memory reserved in the wbuf
+     *  for serialization.  (We assert that serialization requires exactly the expected amount)
+     * Requires:
+     *  ::prepare_for_serialize() has been called and no non-const functions have been called since.
+     *  This dmt has fixed-length values and is in array form.
+     * Performance:
+     *  O(memory)
+     */
+    void serialize_values(uint32_t expected_unpadded_memory, struct wbuf *wb) const;
+
+    /**
+     * Effect:  Insert value into the DMT.
+     *   If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
+     *   Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
+     *      If no such i exists, then let i be |V|
+     *   Then this has the same effect as
+     *    insert_at(tree, value, i);
+     *   If idx!=NULL then i is stored in *idx
+     * Requires:  The signum of h must be monotonically increasing.
+     * Returns:
+     *    0            success
+     *    DB_KEYEXIST  the key is present (h was equal to zero for some value)
+     * On nonzero return, dmt is unchanged.
+     * Performance: time=O(\log N) amortized.
+     * Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
+     */
+    template<typename dmtcmp_t, int (*h)(const uint32_t size, const dmtdata_t &, const dmtcmp_t &)>
+    int insert(const dmtwriter_t &value, const dmtcmp_t &v, uint32_t *const idx);
+
+    /**
+     * Effect: Increases indexes of all items at slot >= idx by 1.
+     *         Insert value into the position at idx.
+     * Returns:
+     *   0         success
+     *   EINVAL    if idx > this->size()
+     * On error, dmt is unchanged.
+     * Performance: time=O(\log N) amortized time.
+     * Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
+     */
+    int insert_at(const dmtwriter_t &value, const uint32_t idx);
+
+    /**
+     * Effect: Delete the item in slot idx.
+     *         Decreases indexes of all items at slot > idx by 1.
+     * Returns
+     *     0            success
+     *     EINVAL       if idx>=this->size()
+     * On error, dmt is unchanged.
+     * Rationale: To delete an item, first find its index using find or find_zero, then delete it.
+     * Performance: time=O(\log N) amortized.
+     */
+    int delete_at(const uint32_t idx);
+
+    /**
+     * Effect:  Iterate over the values of the dmt, from left to right, calling f on each value.
+     *  The first argument passed to f is the size (a uint32_t) of the value stored in the dmt.
+     *  The second argument passed to f is a ref-to-const of the value stored in the dmt.
+     *  The third argument passed to f is the index of the value.
+     *  The fourth argument passed to f is iterate_extra.
+     *  The indices run from 0 (inclusive) to this->size() (exclusive).
+     * Requires: f != NULL
+     * Returns:
+     *  If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate.
+     *  If f always returns zero, then iterate returns 0.
+     * Requires:  Don't modify the dmt while running.  (E.g., f may not insert or delete values from the dmt.)
+     * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt.
+     * Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
+     * Rationale: We may at some point use functors, but for now this is a smaller change from the old DMT.
+     */
+    template<typename iterate_extra_t,
+             int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate(iterate_extra_t *const iterate_extra) const;
+
+    /**
+     * Effect:  Iterate over the values of the dmt, from left to right, calling f on each value.
+     *  The first argument passed to f is the size (a uint32_t) of the value stored in the dmt.
+     *  The second argument passed to f is a ref-to-const of the value stored in the dmt.
+     *  The third argument passed to f is the index of the value.
+     *  The fourth argument passed to f is iterate_extra.
+     *  The indices run from 0 (inclusive) to this->size() (exclusive).
+     *  We will iterate only over [left,right)
+     *
+     * Requires: left <= right
+     * Requires: f != NULL
+     * Returns:
+     *  EINVAL  if right > this->size()
+     *  If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate_on_range.
+     *  If f always returns zero, then iterate_on_range returns 0.
+     * Requires:  Don't modify the dmt while running.  (E.g., f may not insert or delete values from the dmt.)
+     * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt.
+     * Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
+     */
+    template<typename iterate_extra_t,
+             int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const;
+
+    // Attempt to verify this dmt is well formed.  (Crashes/asserts/aborts if not well formed)
+    void verify(void) const;
+
+    /**
+     * Effect:  Iterate over the values of the dmt, from left to right, calling f on each value.
+     *  The first argument passed to f is the size (a uint32_t) of the value stored in the dmt.
+     *  The second argument passed to f is a pointer to the value stored in the dmt.
+     *  The third argument passed to f is the index of the value.
+     *  The fourth argument passed to f is iterate_extra.
+     *  The indices run from 0 (inclusive) to this->size() (exclusive).
+     * Requires: same as for iterate()
+     * Returns: same as for iterate()
+     * Performance: same as for iterate()
+     * Rationale: In general, most iterators should use iterate() since they should not modify the data stored in the dmt.  This function is for iterators which need to modify values (for example, free_items).
+     * Rationale: We assume if you are transforming the data in place, you want to do it to everything at once, so there is not yet an iterate_on_range_ptr (but there could be).
+     */
+    template<typename iterate_extra_t,
+             int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
+    void iterate_ptr(iterate_extra_t *const iterate_extra);
+
+    /**
+     * Effect: Set *value=V_idx
+     * Returns
+     *    0             success
+     *    EINVAL        if idx>=this->size()
+     * On nonzero return, *value is unchanged
+     * Performance: time=O(\log N)
+     */
+    int fetch(const uint32_t idx, uint32_t *const value_size, dmtdataout_t *const value) const;
+
+    /**
+     * Effect:  Find the smallest i such that h(V_i, extra)>=0
+     *  If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value = V_i, and return 0.
+     *  If there is such an i and h(V_i,extra)>0  then set *idxp=i and return DB_NOTFOUND.
+     *  If there is no such i then set *idxp=this->size() and return DB_NOTFOUND.
+     * Note: value is of type dmtdataout_t, which may be of type (dmtdata_t) or (dmtdata_t *) but is fixed by the instantiation.
+     *  If it is the value type, then the value is copied out (even if the value type is a pointer to something else)
+     *  If it is the pointer type, then *value is set to a pointer to the data within the dmt.
+     *  This is determined by the type of the dmt as initially declared.
+     *   If the dmt is declared as dmt<foo_t>, then foo_t's will be stored and foo_t's will be returned by find and related functions.
+     *   If the dmt is declared as dmt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the stored items will be returned by find and related functions.
+     * Rationale:
+     *  Structs too small for malloc should be stored directly in the dmt.
+     *  These structs may need to be edited as they exist inside the dmt, so we need a way to get a pointer within the dmt.
+     *  Using separate functions for returning pointers and values increases code duplication and reduces type-checking.
+     *  That also reduces the ability of the creator of a data structure to give advice to its future users.
+     *  Slight overloading in this case seemed to provide a better API and better type checking.
+     */
+    template<typename dmtcmp_t,
+             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+    int find_zero(const dmtcmp_t &extra, uint32_t *const value_size, dmtdataout_t *const value, uint32_t *const idxp) const;
+
+    /**
+     *   Effect:
+     *    If direction >0 then find the smallest i such that h(V_i,extra)>0.
+     *    If direction <0 then find the largest  i such that h(V_i,extra)<0.
+     *    (Direction may not be equal to zero.)
+     *    If value!=NULL then store V_i in *value
+     *    If idxp!=NULL then store i in *idxp.
+     *   Requires: The signum of h is monotonically increasing.
+     *   Returns
+     *      0             success
+     *      DB_NOTFOUND   no such value is found.
+     *   On nonzero return, *value and *idxp are unchanged
+     *   Performance: time=O(\log N)
+     *   Rationale:
+     *     Here's how to use the find function to find various things
+     *       Cases for find:
+     *        find first value:         ( h(v)=+1, direction=+1 )
+     *        find last value           ( h(v)=-1, direction=-1 )
+     *        find first X              ( h(v)=(v< x) ? -1 : 1    direction=+1 )
+     *        find last X               ( h(v)=(v<=x) ? -1 : 1    direction=-1 )
+     *        find X or successor to X  ( same as find first X. )
+     *
+     *   Rationale: To help understand heaviside functions and behavior of find:
+     *    There are 7 kinds of heaviside functions.
+     *    The signum of h must be monotonically increasing.
+     *    Given a function of the following form, A is the element
+     *    returned for direction>0, B is the element returned
+     *    for direction<0, C is the element returned for
+     *    direction==0 (see find_zero) (with a return of 0), and D is the element
+     *    returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
+     *    If any of A, B, or C are not found, then asking for the
+     *    associated direction will return DB_NOTFOUND.
+     *    See find_zero for more information.
+     *
+     *    Let the following represent the signum of the heaviside function.
+     *
+     *    -...-
+     *        A
+     *         D
+     *
+     *    +...+
+     *    B
+     *    D
+     *
+     *    0...0
+     *    C
+     *
+     *    -...-0...0
+     *        AC
+     *
+     *    0...0+...+
+     *    C    B
+     *
+     *    -...-+...+
+     *        AB
+     *         D
+     *
+     *    -...-0...0+...+
+     *        AC    B
+     */
+    template<typename dmtcmp_t,
+             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+    int find(const dmtcmp_t &extra, int direction, uint32_t *const value_size, dmtdataout_t *const value, uint32_t *const idxp) const;
+
+    /**
+     * Effect: Return the size (in bytes) of the dmt, as it resides in main memory.
+     * If the data stored are pointers, don't include the size of what they all point to.
+     * //TODO(leif or yoni): (maybe rename and) return memory footprint instead of allocated size
+     */
+    size_t memory_size(void);
+
+    // Returns whether all values in the dmt are known to be the same size.
+    // Note:
+    //  There are no false positives, but false negatives are allowed.
+    //  A false negative can happen if this dmt had 2 (or more) different size values,
+    //  and then enough were deleted so that all the remaining ones are the same size.
+    //  Once that happens, this dmt will never again return true for this function unless/until
+    //  ::clear() is called
+    bool value_length_is_fixed(void) const;
+
+
+    // If this dmt is empty, return value is undefined.
+    // else if value_length_is_fixed() then it returns the fixed length.
+    // else returns 0
+    uint32_t get_fixed_length(void) const;
+
+    // Preprocesses the dmt so that serialization can happen quickly.
+    // After this call, serialize_values() can be called but no other mutator function can be called in between.
+    void prepare_for_serialize(void);
+
+private:
+    // Do a bit of verification that subtree and nodes act like packed c structs and do not introduce unnecessary padding for alignment.
+    ENSURE_POD(subtree);
+    static_assert(ALIGNMENT > 0, "ALIGNMENT <= 0");
+    static_assert((ALIGNMENT & (ALIGNMENT - 1)) == 0, "ALIGNMENT not a power of 2");
+    static_assert(sizeof(dmt_node) - sizeof(dmtdata_t) == __builtin_offsetof(dmt_node, value), "value is not last field in node");
+    static_assert(4 * sizeof(uint32_t) == __builtin_offsetof(dmt_node, value), "dmt_node is padded");
+    static_assert(__builtin_offsetof(dmt_node, value) % ALIGNMENT == 0, "dmt_node requires padding for alignment");
+    ENSURE_POD(dmt_node);
+
+    struct dmt_array {
+        uint32_t num_values;
+    };
+
+    struct dmt_tree {
+        subtree root;
+    };
+
+    /*
+    Relationship between values_same_size, d.a.num_values, value_length, is_array:
+    In an empty dmt:
+        is_array is true
+        values_same_size is true
+        value_length is undefined
+        d.a.num_values is 0
+    In a non-empty array dmt:
+        is_array is true
+        values_same_size is true
+        value_length is defined
+        d.a.num_values > 0
+    In a non-empty tree dmt:
+        is_array = false
+        values_same_size is true iff all values have been the same size since the last time the dmt turned into a tree.
+        value_length is defined iff values_same_size is true
+        d.a.num_values is undefined (the memory is used for the tree)
+    Note that in tree form, the dmt keeps track of if all values are the same size until the first time they are not.
+    'values_same_size' will not become true again (even if we change all values to be the same size)
+        until/unless the dmt becomes empty, at which point it becomes an array again.
+     */
+    bool values_same_size;
+    uint32_t value_length;  // valid iff values_same_size is true.
+    struct mempool mp;
+    bool is_array;
+    union {
+        struct dmt_array a;
+        struct dmt_tree t;
+    } d;
+
+    // Returns pad bytes per element (for alignment) or 0 if not fixed length.
+    uint32_t get_fixed_length_alignment_overhead(void) const;
+
+    void verify_internal(const subtree &subtree, std::vector<bool> *touched) const;
+
+    // Retrieves the node for a given subtree.
+    // Requires: !subtree.is_null()
+    dmt_node & get_node(const subtree &subtree) const;
+
+    // Retrieves the node at a given offset in the mempool.
+    dmt_node & get_node(const node_offset offset) const;
+
+    // Returns the weight of a subtree rooted at st.
+    // if st.is_null(), returns 0
+    // Perf: O(1)
+    uint32_t nweight(const subtree &st) const;
+
+    // Allocates space for a node (in the mempool) and uses the dmtwriter to write the value into the node
+    node_offset node_malloc_and_set_value(const dmtwriter_t &value);
+
+    // Uses the dmtwriter to write a value into node n
+    void node_set_value(dmt_node *n, const dmtwriter_t &value);
+
+    // (mempool-)free the memory for a node
+    void node_free(const subtree &st);
+
+    // Effect: Resizes the mempool (holding the array) if necessary to hold one more item of length: this->value_length
+    // Requires:
+    //  This dmt is in array form (and thus this->values_same_size)
+    void maybe_resize_array_for_insert(void);
+
+    // Effect: Converts a dmt from array form to tree form.
+    // Perf: O(n)
+    // Note: This does not clear the 'this->values_same_size' bit
+    void convert_to_tree(void);
+
+    // Effect: Resizes the mempool holding a tree if necessary.  If value==nullptr then it may shrink if overallocated,
+    //         otherwise resize only happens if there is not enough free space for an insert of value
+    void maybe_resize_tree(const dmtwriter_t * value);
+
+    // Returns true if the tree rooted at st would need rebalance after adding
+    // leftmod to the left subtree and rightmod to the right subtree
+    bool will_need_rebalance(const subtree &st, const int leftmod, const int rightmod) const;
+
+    __attribute__((nonnull))
+    void insert_internal(subtree *const subtreep, const dmtwriter_t &value, const uint32_t idx, subtree **const rebalance_subtree);
+
+    template<bool with_resize>
+    int insert_at_array_end(const dmtwriter_t& value_in);
+
+    dmtdata_t * alloc_array_value_end(void);
+
+    dmtdata_t * get_array_value(const uint32_t idx) const;
+
+    dmtdata_t * get_array_value_internal(const struct mempool *mempool, const uint32_t idx) const;
+
+    void convert_from_array_to_tree(void);
+
+    void convert_from_tree_to_array(void);
+
+    void delete_internal(subtree *const subtreep, const uint32_t idx, subtree *const subtree_replace, subtree **const rebalance_subtree);
+
+    template<typename iterate_extra_t,
+             int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_internal_array(const uint32_t left, const uint32_t right,
+                                      iterate_extra_t *const iterate_extra) const;
+
+    template<typename iterate_extra_t,
+             int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
+    void iterate_ptr_internal(const uint32_t left, const uint32_t right,
+                                     const subtree &subtree, const uint32_t idx,
+                                     iterate_extra_t *const iterate_extra);
+
+    template<typename iterate_extra_t,
+             int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
+    void iterate_ptr_internal_array(const uint32_t left, const uint32_t right,
+                                           iterate_extra_t *const iterate_extra);
+
+    template<typename iterate_extra_t,
+             int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_internal(const uint32_t left, const uint32_t right,
+                                const subtree &subtree, const uint32_t idx,
+                                iterate_extra_t *const iterate_extra) const;
+
+    void fetch_internal_array(const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const;
+
+    void fetch_internal(const subtree &subtree, const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const;
+
+    __attribute__((nonnull))
+    void fill_array_with_subtree_offsets(node_offset *const array, const subtree &subtree) const;
+
+    __attribute__((nonnull))
+    void rebuild_subtree_from_offsets(subtree *const subtree, const node_offset *const offsets, const uint32_t numvalues);
+
+    __attribute__((nonnull))
+    void rebalance(subtree *const subtree);
+
+    static void copyout(uint32_t *const outlen, dmtdata_t *const out, const dmt_node *const n);
+
+    static void copyout(uint32_t *const outlen, dmtdata_t **const out, dmt_node *const n);
+
+    static void copyout(uint32_t *const outlen, dmtdata_t *const out, const uint32_t len, const dmtdata_t *const stored_value_ptr);
+
+    static void copyout(uint32_t *const outlen, dmtdata_t **const out, const uint32_t len, dmtdata_t *const stored_value_ptr);
+
+    template<typename dmtcmp_t,
+             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+    int find_internal_zero_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename dmtcmp_t,
+             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+    int find_internal_zero(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename dmtcmp_t,
+             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+    int find_internal_plus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename dmtcmp_t,
+             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+    int find_internal_plus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename dmtcmp_t,
+             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+    int find_internal_minus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename dmtcmp_t,
+             int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
+    int find_internal_minus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
+
+    // Allocate memory for an array:  node_offset[num_idx] from pre-allocated contiguous free space in the mempool.
+    // If there is not enough space, returns nullptr.
+    node_offset* alloc_temp_node_offsets(uint32_t num_idxs);
+
+    // Returns the aligned size of x.
+    // If x % ALIGNMENT == 0, returns x
+    // o.w. returns x + (ALIGNMENT - (x % ALIGNMENT))
+    uint32_t align(const uint32_t x) const;
+};
+
+} // namespace toku
+
+// include the implementation here
+#include "dmt.cc"
+
diff --git a/storage/tokudb/PerconaFT/util/doubly_linked_list.h b/storage/tokudb/PerconaFT/util/doubly_linked_list.h
new file mode 100644
index 00000000..25ddaaa3
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/doubly_linked_list.h
@@ -0,0 +1,174 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+//******************************************************************************
+//
+// Overview: A doubly linked list with elements of type T.
+//   Each element that wants to be put into the list provides a
+//   LinkedListElement<T> as well as a pointer to the object of type T.
+//   Typically, the user embeds the linked list element into the object itself,
+//   for example as
+//     struct foo {
+//       toku::LinkedListElement<struct foo *> linked_list_elt;
+//       ... other elements of foo
+//     };
+//   then when inserting foo into a list defined as 
+//      toku::DoublyLinkedList<struct foo *> list_of_foos;
+//   you write
+//      struct foo f;
+//      list_of_foos.insert(&f.linked_list_elt, &f);
+//
+// Operations:  An init() method is provided for the DoublyLinkedList in place
+//   of a constructor/destructor pair (it doesn't need to do anything but fill in a field).
+//   Operations to insert an element and remove it, as well as to pop
+//   an element out of the list.
+//   Also a LinkedListElement class is provided with a method to get a
+//   pointer to the object of type T.
+//******************************************************************************
+
+#include <stdbool.h>
+#include <portability/toku_assert.h>
+
+namespace toku {
+
+template<typename T> class DoublyLinkedList;
+
+// A list node meant to be embedded in (or kept alongside) the object it
+// represents.  It stores a container value of type T plus the links to the
+// neighbouring nodes; only DoublyLinkedList<T> (a friend) may modify them.
+template<typename T>
+class LinkedListElement {
+    friend class DoublyLinkedList<T>;
+
+ private:
+    T container;                    // value recorded by DoublyLinkedList<T>::insert()
+    LinkedListElement<T> *prev;     // previous node, NULL at the head of a list
+    LinkedListElement<T> *next;     // following node, NULL at the tail of a list
+
+ public:
+    // Returns (by value) the container recorded for this element.
+    T get_container(void) {
+        return this->container;
+    }
+};
+
+// A doubly linked list of LinkedListElement<T> nodes.  The list object itself
+// holds nothing but a pointer to its first element.
+template<typename T>
+class DoublyLinkedList {
+ public:
+    // Effect: Initialize a doubly linked list (to be empty).
+    void init(void);
+
+    // Effect: Add an item to a linked list, recording container in ll_elt.
+    // Implementation note: Push the item to the head of the list.
+    void insert(LinkedListElement<T> *ll_elt, T container);
+
+    // Effect: Remove an item from a linked list.
+    // Requires: ll_elt is currently an element of this list.
+    void remove(LinkedListElement<T> *ll_elt);
+
+    // Effect: If the list is empty, return false.
+    //   Otherwise return true, set *ll_eltp to the first item, and remove that item from the list.
+    bool pop(LinkedListElement<T> **ll_eltp);
+
+    // Effect: Call fun(e, extra) on the container e of every element of the linked list.
+    //  If fun ever returns nonzero, quit early and return that value.
+    //  If fun always returns zero, this function returns zero.
+    template<typename extra_t>
+    int iterate(int (*fun)(T container, extra_t extra), extra_t extra);
+
+ private:
+    // First element of the list; NULL when the list is empty.
+    LinkedListElement<T> *m_first;
+};
+
+//******************************************************************************
+// DoublyLinkedList implementation starts here.
+//******************************************************************************
+
+#include <stddef.h>
+
+
+
+// Makes the list empty by clearing the head pointer.  Any elements that were
+// previously linked are not touched.
+template<typename T>
+void DoublyLinkedList<T>::init(void) {
+    this->m_first = NULL;
+}
+
+// Pushes ll_elt onto the front of the list and records container in it.
+// ll_elt's previous link values are overwritten unconditionally.
+template<typename T>
+void DoublyLinkedList<T>::insert(LinkedListElement<T> *ll_elt, T container) {
+    LinkedListElement<T> *const former_head = this->m_first;
+
+    // Fill in the new element: it has no predecessor and is followed by the
+    // element that used to be first (possibly NULL).
+    ll_elt->container = container;
+    ll_elt->next = former_head;
+    ll_elt->prev = NULL;
+
+    // Make the former head point back at the new element, if there was one.
+    if (former_head) {
+        former_head->prev = ll_elt;
+    }
+    this->m_first = ll_elt;
+}
+
+// Unlinks ll_elt from the list by splicing its neighbours together.
+// ll_elt's own prev/next fields are left as they were.
+template<typename T>
+void DoublyLinkedList<T>::remove(LinkedListElement<T> *ll_elt) {
+    LinkedListElement<T> *const before = ll_elt->prev;
+    LinkedListElement<T> *const after  = ll_elt->next;
+
+    // Predecessor side: without a predecessor, ll_elt was the head.
+    if (before != NULL) {
+        before->next = after;
+    } else {
+        m_first = after;
+    }
+
+    // Successor side: nothing to patch when ll_elt was the last element.
+    if (after != NULL) {
+        after->prev = before;
+    }
+}
+
+// Detaches the first element of the list.
+// Returns false (leaving *ll_eltp untouched) when the list is empty; otherwise
+// stores the detached element in *ll_eltp and returns true.
+template<typename T>
+bool DoublyLinkedList<T>::pop(LinkedListElement<T> **ll_eltp) {
+    LinkedListElement<T> *first = m_first;
+    if (first == NULL) {
+        return false;
+    }
+
+    // The head of a well-formed list never has a predecessor.
+    invariant(first->prev==NULL);
+
+    // Promote the second element (if any) to be the new head.
+    LinkedListElement<T> *const successor = first->next;
+    m_first = successor;
+    if (successor != NULL) {
+        successor->prev = NULL;
+    }
+
+    // Hand the detached element back with its forward link cleared.
+    first->next = NULL;
+    *ll_eltp = first;
+    return true;
+}
+
+// Walks the list from the first element onward, calling fun(container, extra)
+// for each element.  Stops at the first nonzero result and returns it;
+// returns 0 when every call returned 0 (or the list is empty).
+template<typename T>
+template<typename extra_t>
+int DoublyLinkedList<T>::iterate(int (*fun)(T container, extra_t extra), extra_t extra) {
+    LinkedListElement<T> *cursor = m_first;
+    while (cursor != NULL) {
+        const int rc = fun(cursor->container, extra);
+        if (rc != 0) {
+            return rc;
+        }
+        // The next link is read only after fun has returned.
+        cursor = cursor->next;
+    }
+    return 0;
+}
+
+}
diff --git a/storage/tokudb/PerconaFT/util/fmutex.h b/storage/tokudb/PerconaFT/util/fmutex.h
new file mode 100644
index 00000000..fed1bc24
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/fmutex.h
@@ -0,0 +1,146 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+extern toku_instr_key *fmutex_cond_key;
+
+// fair mutex
+// A pthread mutex combined with an explicit queue of waiters: waiters are
+// appended at wait_tail (enq_item) and removed from wait_head (deq_item).
+struct fmutex {
+    pthread_mutex_t mutex;          // taken by fmutex_lock/fmutex_mutex_unlock before they touch the fields below
+    int mutex_held;                 // set to 1 when the fair mutex is granted, 0 when it is released
+    int num_want_mutex;             // count of threads currently waiting in fmutex_lock
+    struct queue_item *wait_head;   // front of the waiter queue (next to be signalled), NULL if empty
+    struct queue_item *wait_tail;   // back of the waiter queue, NULL if empty
+};
+
+// item on the queue
+// One entry per waiting thread in an fmutex's wait queue (singly linked).
+struct queue_item {
+    pthread_cond_t *cond;       // condition variable signalled to wake this waiter
+    struct queue_item *next;    // next (younger) waiter; enq_item requires NULL on entry
+};
+
+// Appends item to the back of fm's wait queue.
+// Requires: item->next == NULL, i.e. item is not linked to anything yet.
+static void enq_item(struct fmutex *fm, struct queue_item *const item) {
+    assert(item->next == NULL);
+
+    struct queue_item *const old_tail = fm->wait_tail;
+    if (old_tail == NULL) {
+        // Empty queue: the new item is the head as well as the tail.
+        assert(fm->wait_head == NULL);
+        fm->wait_head = item;
+    } else {
+        old_tail->next = item;
+    }
+    fm->wait_tail = item;
+}
+
+// Removes the front entry of fm's wait queue and returns its condition
+// variable.  The removed entry's own next pointer is left unchanged.
+// Requires: the queue is not empty.
+static pthread_cond_t *deq_item(struct fmutex *fm) {
+    assert(fm->wait_head != NULL);
+    assert(fm->wait_tail != NULL);
+
+    struct queue_item *const front = fm->wait_head;
+    const bool was_only_entry = (front == fm->wait_tail);
+
+    fm->wait_head = front->next;
+    if (was_only_entry) {
+        // The queue just became empty; the tail must not keep pointing at front.
+        fm->wait_tail = NULL;
+    }
+    return front->cond;
+}
+
+// Initializes fm: a pthread mutex with default attributes, not held, no
+// waiters, empty wait queue.  The result of pthread_mutex_init is not checked.
+void fmutex_create(struct fmutex *fm) {
+    pthread_mutex_init(&fm->mutex, NULL);
+
+    // Start with an empty queue and nobody holding or wanting the lock.
+    fm->wait_head = fm->wait_tail = NULL;
+    fm->num_want_mutex = 0;
+    fm->mutex_held = 0;
+}
+
+// Destroys the pthread mutex embedded in fm.  The other fields (including the
+// wait queue) are not inspected or modified.
+void fmutex_destroy(struct fmutex *fm) {
+    pthread_mutex_t *const internal_mutex = &fm->mutex;
+    pthread_mutex_destroy(internal_mutex);
+}
+
+// Acquires the fair mutex, blocking until it has been handed to this thread.
+//
+// The internal pthread mutex (fm->mutex) is acquired and released inside this
+// function on every path; the caller must NOT already hold it.
+//
+// Fairness: the lock is taken immediately only when it is free AND nobody is
+// waiting.  Otherwise the caller appends itself to the wait queue and sleeps
+// on a private condition variable until fmutex_mutex_unlock() dequeues and
+// signals it.
+void fmutex_lock(struct fmutex *fm) {
+    pthread_mutex_lock(&fm->mutex);
+
+    if (fm->mutex_held == 0 && fm->num_want_mutex == 0) {
+        // No one holds the lock and no one is ahead of us.  Grant the lock.
+        // (Both conditions are required: taking the lock while it is held
+        // breaks mutual exclusion, and taking it while others wait would let
+        // us barge past a waiter that was already signalled.)
+        fm->mutex_held = 1;
+        pthread_mutex_unlock(&fm->mutex);
+        return;
+    }
+
+    // cond is a raw pthread_cond_t, so it is initialized with the plain
+    // pthread call and default attributes.
+    pthread_cond_t cond;
+    pthread_cond_init(&cond, NULL);
+    struct queue_item item;
+    item.cond = &cond;
+    item.next = NULL;
+    enq_item(fm, &item);
+
+    // Wait for our turn.  pthread_cond_wait may wake up spuriously, so keep
+    // waiting for as long as our item is still linked in the queue: the
+    // unlocker always dequeues an item before signalling it.
+    ++fm->num_want_mutex;
+    bool still_queued;
+    do {
+        pthread_cond_wait(&cond, &fm->mutex);
+        still_queued = false;
+        for (struct queue_item *q = fm->wait_head; q != NULL; q = q->next) {
+            if (q == &item) {
+                still_queued = true;
+                break;
+            }
+        }
+    } while (still_queued);
+    pthread_cond_destroy(&cond);
+
+    // Now it's our turn.
+    assert(fm->num_want_mutex > 0);
+    assert(fm->mutex_held == 0);
+
+    // Not waiting anymore; grab the lock.
+    --fm->num_want_mutex;
+    fm->mutex_held = 1;
+
+    pthread_mutex_unlock(&fm->mutex);
+}
+
+// Releases the fair mutex and, if any thread is queued, wakes the one at the
+// front of the wait queue so that it can take the lock next.
+//
+// The internal pthread mutex (fm->mutex) is acquired and released inside this
+// function on every path; the caller must NOT already hold it.
+void fmutex_mutex_unlock(struct fmutex *fm) {
+    pthread_mutex_lock(&fm->mutex);
+
+    fm->mutex_held = 0;
+    if (fm->wait_head == NULL) {
+        // Nobody is queued, so nobody may be counted as waiting either.
+        assert(fm->num_want_mutex == 0);
+    } else {
+        assert(fm->num_want_mutex > 0);
+
+        // Grant lock to the next waiter.  The waiter itself decrements
+        // num_want_mutex and sets mutex_held once it wakes up.
+        pthread_cond_t *cond = deq_item(fm);
+        pthread_cond_signal(cond);
+    }
+
+    pthread_mutex_unlock(&fm->mutex);
+}
+
+// Returns the number of threads that hold or are waiting for the fair mutex:
+// mutex_held (0 or 1) plus the number of waiters.
+// The fields are read without taking fm->mutex.
+// (A trailing `const` is only valid on member functions, so it is dropped.)
+int fmutex_users(struct fmutex *fm) {
+    return fm->mutex_held + fm->num_want_mutex;
+}
+
+// Returns the number of threads currently waiting for the fair mutex.
+// The field is read without taking fm->mutex.
+// (A trailing `const` is only valid on member functions, so it is dropped.)
+int fmutex_blocked_users(struct fmutex *fm) {
+    return fm->num_want_mutex;
+}
diff --git a/storage/tokudb/PerconaFT/util/frwlock.cc b/storage/tokudb/PerconaFT/util/frwlock.cc
new file mode 100644
index 00000000..1f821fe5
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/frwlock.cc
@@ -0,0 +1,351 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <toku_assert.h>
+
+#include <util/context.h>
+#include <util/frwlock.h>
+
+// Instrumentation key named after frwlock's m_wait_read condition variable.
+// It is only defined in this file, never dereferenced: frwlock::init()
+// initializes m_wait_read with toku_uninstrumented.
+toku_instr_key *frwlock_m_wait_read_key;
+
+namespace toku {
+
+    // Per-thread cache of the OS thread id; -1 means "not looked up yet".
+    static __thread int thread_local_tid = -1;
+
+    // Return the calling thread's id, asking toku_os_gettid() only while the
+    // cached value is still -1.
+    static int get_local_tid() {
+        if (thread_local_tid != -1) {
+            return thread_local_tid;
+        }
+        thread_local_tid = toku_os_gettid();
+        return thread_local_tid;
+    }
+
+    // Prepare the lock for use.  `mutex` is the external mutex that the other
+    // member functions assert is held; it is only stored here, not locked.
+    // With TOKU_MYSQL_WITH_PFS, rwlock_instr_key is used to initialize the
+    // m_rwlock object that the instrumentation calls refer to.
+    void frwlock::init(toku_mutex_t *const mutex
+#if defined(TOKU_MYSQL_WITH_PFS)
+                       ,
+                       const toku_instr_key &rwlock_instr_key
+#endif
+                       ) {
+        m_mutex = mutex;
+
+        // Nobody holds the lock and nobody is waiting for it.
+        m_num_readers = 0;
+        m_num_writers = 0;
+        m_num_want_read = 0;
+        m_num_want_write = 0;
+        m_num_expensive_want_write = 0;
+        m_num_signaled_readers = 0;
+
+        // No writer yet, hence nothing expensive and no blocking context.
+        m_current_writer_expensive = false;
+        m_read_wait_expensive = false;
+        m_current_writer_tid = -1;
+        m_blocking_writer_context_id = CTX_INVALID;
+
+        // The wait queue starts out empty.
+        m_wait_head = nullptr;
+        m_wait_tail = nullptr;
+
+#if defined(TOKU_MYSQL_WITH_PFS)
+        toku_pthread_rwlock_init(rwlock_instr_key, &m_rwlock, nullptr);
+#endif
+
+        // All waiting readers share one condition variable and one queue
+        // entry, which is not in the queue initially.
+        toku_cond_init(toku_uninstrumented, &m_wait_read, nullptr);
+        m_queue_item_read.cond = &m_wait_read;
+        m_queue_item_read.next = nullptr;
+        m_wait_read_is_in_queue = false;
+    }
+
+    // Release what init() created: the shared reader condition variable and,
+    // with TOKU_MYSQL_WITH_PFS, the instrumented rwlock object.  The external
+    // mutex passed to init() is not touched.
+    void frwlock::deinit(void) {
+        toku_cond_destroy(&m_wait_read);
+#if defined(TOKU_MYSQL_WITH_PFS)
+        toku_pthread_rwlock_destroy(&m_rwlock);
+#endif
+    }
+
+    // True when no waiter (writer item or the shared reader item) is queued.
+    bool frwlock::queue_is_empty(void) const { return m_wait_head == nullptr; }
+
+    // Append `item` to the tail of the wait queue.  The item must arrive
+    // unlinked (item->next == nullptr); the queue stores the pointer only.
+    void frwlock::enq_item(queue_item *const item) {
+        paranoid_invariant_null(item->next);
+        if (m_wait_tail == nullptr) {
+            // Empty queue: the new item is the head as well as the tail.
+            paranoid_invariant_null(m_wait_head);
+            m_wait_head = item;
+        } else {
+            // Non-empty queue: link behind the current tail.
+            m_wait_tail->next = item;
+        }
+        m_wait_tail = item;
+    }
+
+    // Remove the item at the head of the wait queue and return the condition
+    // variable it refers to.  The queue must not be empty.  The removed
+    // item's own `next` field is left as it was.
+    toku_cond_t *frwlock::deq_item(void) {
+        paranoid_invariant_notnull(m_wait_head);
+        paranoid_invariant_notnull(m_wait_tail);
+        queue_item *const front = m_wait_head;
+        m_wait_head = front->next;
+        if (front == m_wait_tail) {
+            // That was the only item; the queue is empty now.
+            m_wait_tail = nullptr;
+        }
+        return front->cond;
+    }
+
+    // Prerequisite: Holds m_mutex.
+    // Acquire the write lock, waiting in the queue if it cannot be granted
+    // immediately.  `expensive` is remembered while waiting (in
+    // m_num_expensive_want_write) and while holding (in
+    // m_current_writer_expensive) so the *_is_expensive() queries can report
+    // it.
+    void frwlock::write_lock(bool expensive) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+        /* Instrumentation start */
+        toku_rwlock_instrumentation rwlock_instr;
+        toku_instr_rwlock_wrlock_wait_start(
+            rwlock_instr, m_rwlock, __FILE__, __LINE__);
+#endif
+
+        toku_mutex_assert_locked(m_mutex);
+        // Fast path: the lock is completely free and nobody is queued.
+        if (this->try_write_lock(expensive)) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+            /* Instrumentation end */
+            toku_instr_rwlock_wrlock_wait_end(rwlock_instr, 0);
+#endif
+            return;
+        }
+
+        // Slow path: queue an item that lives on this stack frame and refers
+        // to a condition variable private to this waiter.
+        toku_cond_t cond = TOKU_COND_INITIALIZER;
+        queue_item item = {.cond = &cond, .next = nullptr};
+        this->enq_item(&item);
+
+        // Wait for our turn.
+        ++m_num_want_write;
+        if (expensive) {
+            ++m_num_expensive_want_write;
+        }
+        if (m_num_writers == 0 && m_num_want_write == 1) {
+            // We are the first to want a write lock. No new readers can get the
+            // lock.
+            // Set our thread id and context for proper instrumentation.
+            // see: toku_context_note_frwlock_contention()
+            m_current_writer_tid = get_local_tid();
+            m_blocking_writer_context_id = toku_thread_get_context()->get_id();
+        }
+        // The item is dequeued and `cond` signalled by
+        // maybe_signal_next_writer() or maybe_signal_or_broadcast_next().
+        // NOTE(review): there is no retry loop around the wait; the invariants
+        // below assume it returns only after such a signal.
+        toku_cond_wait(&cond, m_mutex);
+        toku_cond_destroy(&cond);
+
+        // Now it's our turn.
+        paranoid_invariant(m_num_want_write > 0);
+        paranoid_invariant_zero(m_num_readers);
+        paranoid_invariant_zero(m_num_writers);
+        paranoid_invariant_zero(m_num_signaled_readers);
+
+        // Not waiting anymore; grab the lock.
+        --m_num_want_write;
+        if (expensive) {
+            --m_num_expensive_want_write;
+        }
+        m_num_writers = 1;
+        m_current_writer_expensive = expensive;
+        m_current_writer_tid = get_local_tid();
+        m_blocking_writer_context_id = toku_thread_get_context()->get_id();
+
+#if defined(TOKU_MYSQL_WITH_PFS)
+        /* Instrumentation end */
+        toku_instr_rwlock_wrlock_wait_end(rwlock_instr, 0);
+#endif
+    }
+
+    // Take the write lock only if that needs no waiting.  Requires m_mutex.
+    // Returns true with the lock held by the caller, or false with the lock
+    // state untouched.
+    bool frwlock::try_write_lock(bool expensive) {
+        toku_mutex_assert_locked(m_mutex);
+        // Free means: no readers, no writer, no readers that were signalled
+        // but have not run yet, and no writer queued ahead of us.
+        const bool lock_is_free =
+            m_num_readers == 0 && m_num_writers == 0 &&
+            m_num_signaled_readers == 0 && m_num_want_write == 0;
+        if (!lock_is_free) {
+            return false;
+        }
+        // No one holds the lock.  Grant the write lock.
+        paranoid_invariant_zero(m_num_want_write);
+        paranoid_invariant_zero(m_num_want_read);
+        m_num_writers = 1;
+        m_current_writer_expensive = expensive;
+        // Record who we are so contention can be attributed to this writer.
+        m_current_writer_tid = get_local_tid();
+        m_blocking_writer_context_id = toku_thread_get_context()->get_id();
+        return true;
+    }
+
+    // Acquire a read lock.  Requires m_mutex.  If a writer holds the lock or
+    // is waiting for it, the caller waits on the shared reader condition
+    // variable until maybe_signal_or_broadcast_next() broadcasts on it.
+    void frwlock::read_lock(void) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+        /* Instrumentation start */
+        toku_rwlock_instrumentation rwlock_instr;
+        toku_instr_rwlock_rdlock_wait_start(
+            rwlock_instr, m_rwlock, __FILE__, __LINE__);
+#endif
+        toku_mutex_assert_locked(m_mutex);
+        if (m_num_writers > 0 || m_num_want_write > 0) {
+            // All waiting readers share one queue entry; only the first of
+            // them has to put it on the queue.
+            if (!m_wait_read_is_in_queue) {
+                // Throw the read cond_t onto the queue.
+                paranoid_invariant(m_num_signaled_readers == m_num_want_read);
+                // deq_item() does not reset `next`, so clear it before reuse.
+                m_queue_item_read.next = nullptr;
+                this->enq_item(&m_queue_item_read);
+                m_wait_read_is_in_queue = true;
+                paranoid_invariant(!m_read_wait_expensive);
+                // Snapshot whether any writer ahead of this reader batch is
+                // expensive; read_lock_is_expensive() reports this value
+                // while the entry is queued.
+                m_read_wait_expensive = (m_current_writer_expensive ||
+                                         (m_num_expensive_want_write > 0));
+            }
+
+            // Note this contention event in engine status.
+            toku_context_note_frwlock_contention(
+                toku_thread_get_context()->get_id(),
+                m_blocking_writer_context_id);
+
+            // Wait for our turn.
+            ++m_num_want_read;
+            toku_cond_wait(&m_wait_read, m_mutex);
+
+            // Now it's our turn.
+            paranoid_invariant_zero(m_num_writers);
+            paranoid_invariant(m_num_want_read > 0);
+            paranoid_invariant(m_num_signaled_readers > 0);
+
+            // Not waiting anymore; grab the lock.
+            --m_num_want_read;
+            --m_num_signaled_readers;
+        }
+        ++m_num_readers;
+#if defined(TOKU_MYSQL_WITH_PFS)
+        /* Instrumentation end */
+        toku_instr_rwlock_rdlock_wait_end(rwlock_instr, 0);
+#endif
+    }
+
+    // Take a read lock only if that needs no waiting.  Requires m_mutex.
+    // Returns true with one more reader counted, or false with the lock state
+    // untouched.
+    bool frwlock::try_read_lock(void) {
+        toku_mutex_assert_locked(m_mutex);
+        // A writer that holds the lock, or one that is merely waiting for it,
+        // keeps new readers out.
+        const bool writer_in_the_way =
+            (m_num_writers != 0) || (m_num_want_write != 0);
+        if (!writer_in_the_way) {
+            // Grant the read lock.
+            ++m_num_readers;
+            return true;
+        }
+        return false;
+    }
+
+    // Called when a reader leaves: if a writer is waiting and no reader holds
+    // the lock or is about to (signalled but not yet running), wake the item
+    // at the head of the queue, which must be a writer.
+    void frwlock::maybe_signal_next_writer(void) {
+        if (m_num_want_write == 0 || m_num_signaled_readers != 0 ||
+            m_num_readers != 0) {
+            // Either nobody wants to write or readers are still in the way.
+            return;
+        }
+        toku_cond_t *cond = this->deq_item();
+        paranoid_invariant(cond != &m_wait_read);
+        // Grant write lock to waiting writer.
+        paranoid_invariant(m_num_want_write > 0);
+        toku_cond_signal(cond);
+    }
+
+    // Drop one read lock.  Requires m_mutex.  Afterwards a waiting writer is
+    // woken if this was the last obstacle (see maybe_signal_next_writer).
+    void frwlock::read_unlock(void) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+        toku_instr_rwlock_unlock(m_rwlock);
+#endif
+        toku_mutex_assert_locked(m_mutex);
+        paranoid_invariant(m_num_writers == 0);
+        paranoid_invariant(m_num_readers > 0);
+        m_num_readers -= 1;
+        maybe_signal_next_writer();
+    }
+
+    // Would a read_lock() issued now have to wait behind an expensive writer?
+    // Requires m_mutex.
+    bool frwlock::read_lock_is_expensive(void) {
+        toku_mutex_assert_locked(m_mutex);
+        if (!m_wait_read_is_in_queue) {
+            // No reader batch is queued: look at the current writer and at
+            // the writers that are waiting right now.
+            return m_current_writer_expensive ||
+                   (m_num_expensive_want_write > 0);
+        }
+        // A reader batch is already queued: use the value recorded when its
+        // entry was put on the queue.
+        return m_read_wait_expensive;
+    }
+
+    // Called when the writer leaves: wake whatever is at the head of the wait
+    // queue.  A writer item wakes that single writer; the shared reader item
+    // wakes every waiting reader at once.
+    void frwlock::maybe_signal_or_broadcast_next(void) {
+        paranoid_invariant(m_num_signaled_readers == 0);
+
+        if (this->queue_is_empty()) {
+            // Nobody is waiting for anything.
+            paranoid_invariant(m_num_want_write == 0);
+            paranoid_invariant(m_num_want_read == 0);
+            return;
+        }
+
+        toku_cond_t *cond = this->deq_item();
+        if (cond != &m_wait_read) {
+            // Grant write lock to waiting writer.
+            paranoid_invariant(m_num_want_write > 0);
+            toku_cond_signal(cond);
+            return;
+        }
+
+        // Grant read locks to all waiting readers
+        paranoid_invariant(m_wait_read_is_in_queue);
+        paranoid_invariant(m_num_want_read > 0);
+        // The shared entry has left the queue, so its bookkeeping is reset.
+        m_wait_read_is_in_queue = false;
+        m_read_wait_expensive = false;
+        // Each woken reader decrements this once it runs; until it reaches
+        // zero no writer is granted the lock.
+        m_num_signaled_readers = m_num_want_read;
+        toku_cond_broadcast(cond);
+    }
+
+    // Drop the write lock.  Requires m_mutex.  Clears everything recorded
+    // about the current writer, then wakes the next waiter(s), if any.
+    void frwlock::write_unlock(void) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+        toku_instr_rwlock_unlock(m_rwlock);
+#endif
+        toku_mutex_assert_locked(m_mutex);
+        paranoid_invariant(m_num_writers == 1);
+
+        // Forget the writer that is leaving.
+        m_blocking_writer_context_id = CTX_INVALID;
+        m_current_writer_tid = -1;
+        m_current_writer_expensive = false;
+        m_num_writers = 0;
+
+        maybe_signal_or_broadcast_next();
+    }
+    // Would a write_lock() issued now have to wait behind an expensive
+    // writer, either the current holder or one already waiting?
+    // Requires m_mutex.
+    bool frwlock::write_lock_is_expensive(void) {
+        toku_mutex_assert_locked(m_mutex);
+        const bool holder_is_expensive = m_current_writer_expensive;
+        const bool waiter_is_expensive = m_num_expensive_want_write > 0;
+        return waiter_is_expensive || holder_is_expensive;
+    }
+
+    // Counter accessors.  Except for writers(), each asserts that m_mutex is
+    // held.
+
+    // Holders plus waiters, readers and writers combined.
+    uint32_t frwlock::users(void) const {
+        toku_mutex_assert_locked(m_mutex);
+        return m_num_readers + m_num_writers + m_num_want_read +
+               m_num_want_write;
+    }
+    // Waiters only, readers and writers combined.
+    uint32_t frwlock::blocked_users(void) const {
+        toku_mutex_assert_locked(m_mutex);
+        return m_num_want_read + m_num_want_write;
+    }
+    // Number of writers holding the lock (set to 0 or 1 by this file).
+    uint32_t frwlock::writers(void) const {
+        // this is sometimes called as "assert(lock->writers())" when we
+        // assume we have the write lock.  if that's the assumption, we may
+        // not own the mutex, so we don't assert_locked here
+        return m_num_writers;
+    }
+    // Number of writers waiting for the lock.
+    uint32_t frwlock::blocked_writers(void) const {
+        toku_mutex_assert_locked(m_mutex);
+        return m_num_want_write;
+    }
+    // Number of readers holding the lock.
+    uint32_t frwlock::readers(void) const {
+        toku_mutex_assert_locked(m_mutex);
+        return m_num_readers;
+    }
+    // Number of readers waiting for the lock.
+    uint32_t frwlock::blocked_readers(void) const {
+        toku_mutex_assert_locked(m_mutex);
+        return m_num_want_read;
+    }
+
+} // namespace toku
diff --git a/storage/tokudb/PerconaFT/util/frwlock.h b/storage/tokudb/PerconaFT/util/frwlock.h
new file mode 100644
index 00000000..b02d95e5
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/frwlock.h
@@ -0,0 +1,131 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <toku_portability.h>
+#include <toku_pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <util/context.h>
+
+//TODO: update comment, this is from rwlock.h
+
+namespace toku {
+
+    // Reader/writer lock whose state is protected by an external mutex that
+    // the caller passes to init() and must hold around the lock/unlock calls
+    // (they assert it).  Waiters are kept in a FIFO queue: each waiting
+    // writer has its own entry, while all waiting readers share one entry
+    // (m_queue_item_read) and are woken together.  There is no constructor;
+    // use init()/deinit().
+    class frwlock {
+       public:
+        // Set up the lock; with TOKU_MYSQL_WITH_PFS an instrumentation key
+        // for the rwlock object is required as well.
+        void init(toku_mutex_t *const mutex
+#if defined(TOKU_MYSQL_WITH_PFS)
+                  ,
+                  const toku_instr_key &rwlock_instr_key
+#endif
+                  );
+        void deinit(void);
+
+        // `expensive` marks a writer as expensive; it is what the
+        // *_is_expensive() queries below report on.
+        void write_lock(bool expensive);
+        bool try_write_lock(bool expensive);
+        void write_unlock(void);
+        // returns true if acquiring a write lock will be expensive
+        bool write_lock_is_expensive(void);
+
+        void read_lock(void);
+        bool try_read_lock(void);
+        void read_unlock(void);
+        // returns true if acquiring a read lock will be expensive
+        bool read_lock_is_expensive(void);
+
+        // Counters: "blocked" means waiting for the lock, the others count
+        // current holders; users() is holders plus waiters.
+        uint32_t users(void) const;
+        uint32_t blocked_users(void) const;
+        uint32_t writers(void) const;
+        uint32_t blocked_writers(void) const;
+        uint32_t readers(void) const;
+        uint32_t blocked_readers(void) const;
+
+       private:
+        // One entry of the wait queue: the condition variable to wake and the
+        // next entry in line.
+        struct queue_item {
+            toku_cond_t *cond;
+            struct queue_item *next;
+        };
+
+        bool queue_is_empty(void) const;
+        void enq_item(queue_item *const item);
+        toku_cond_t *deq_item(void);
+        void maybe_signal_or_broadcast_next(void);
+        void maybe_signal_next_writer(void);
+
+        // external mutex guarding all of the state below
+        toku_mutex_t *m_mutex;
+
+        uint32_t m_num_readers;
+        uint32_t m_num_writers;
+        uint32_t m_num_want_write;
+        uint32_t m_num_want_read;
+        // readers that have been woken by a broadcast but have not yet
+        // counted themselves in m_num_readers
+        uint32_t m_num_signaled_readers;
+        // number of writers waiting that are expensive
+        // MUST be <= m_num_want_write
+        uint32_t m_num_expensive_want_write;
+        // bool that states if the current writer is expensive
+        // if there is no current writer, then is false
+        bool m_current_writer_expensive;
+        // bool that states if waiting for a read
+        // is expensive
+        // if there are currently no waiting readers, then set to false
+        bool m_read_wait_expensive;
+        // thread-id of the current writer
+        int m_current_writer_tid;
+        // context id describing the context of the current writer blocking
+        // new readers (either because this writer holds the write lock or
+        // is the first to want the write lock).
+        context_id m_blocking_writer_context_id;
+        // the single queue entry shared by all waiting readers, and whether
+        // it is currently linked into the wait queue
+        queue_item m_queue_item_read;
+        bool m_wait_read_is_in_queue;
+
+        // condition variable all waiting readers sleep on
+        toku_cond_t m_wait_read;
+#if defined(TOKU_MYSQL_WITH_PFS)
+        toku_pthread_rwlock_t m_rwlock;
+#endif
+        // FIFO of waiters: dequeue at the head, enqueue at the tail
+        queue_item *m_wait_head;
+        queue_item *m_wait_tail;
+    };
+
+    ENSURE_POD(frwlock);
+
+} // namespace toku
+
+// include the implementation here
+// #include "frwlock.cc"
diff --git a/storage/tokudb/PerconaFT/util/growable_array.h b/storage/tokudb/PerconaFT/util/growable_array.h
new file mode 100644
index 00000000..bc9e67af
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/growable_array.h
@@ -0,0 +1,138 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <memory.h>
+
+//******************************************************************************
+//
+// Overview: A growable array is a little bit like std::vector except that
+//  it doesn't have constructors (hence can be used in static constructs, since
+//  the google style guide says no constructors), and it's a little simpler.
+// Operations:
+//   init and deinit (we don't have constructors and destructors).
+//   fetch_unchecked to get values out.
+//   store_unchecked to put values in.
+//   push to add an element at the end
+//   get_size to find out the size
+//   memory_size to find out how much memory the data structure is using.
+//
+//******************************************************************************
+
+namespace toku {
+
+// A minimal growable array of T without constructors or destructors: call
+// init() before first use and deinit() to release the storage.  Elements are
+// copied in and out by value.
+template<typename T> class GrowableArray {
+ public:
+    void init (void)
+    // Effect: Initialize the array to contain no elements.
+    {
+        m_array=NULL;
+        m_size=0;
+        m_size_limit=0;
+    }
+
+    void deinit (void)
+    // Effect: Deinitialize the array (freeing any memory it uses, for example).
+    {
+        toku_free(m_array);
+        m_array     =NULL;
+        m_size      =0;
+        m_size_limit=0;
+    }
+
+    T fetch_unchecked (size_t i) const
+    // Effect: Fetch the ith element.  If i is out of range, the system asserts
+    //  (through paranoid_invariant, the same check store_unchecked performs).
+    {
+        paranoid_invariant(i<m_size);
+        return m_array[i];
+    }
+
+    void store_unchecked (size_t i, T v)
+    // Effect: Store v in the ith element.  If i is out of range, the system asserts
+    //  (through paranoid_invariant).
+    {
+        paranoid_invariant(i<m_size);
+        m_array[i]=v;
+    }
+
+    void push (T v)
+    // Effect: Add v to the end of the array (increasing the size).  The amortized cost of this operation is constant.
+    // Implementation hint:  Double the size of the array when it gets too big so that the amortized cost stays constant.
+    {
+        if (m_size>=m_size_limit) {
+            // Out of room: start with one slot, afterwards double the capacity.
+            if (m_array==NULL) {
+                m_size_limit=1;
+            } else {
+                m_size_limit*=2;
+            }
+            XREALLOC_N(m_size_limit, m_array);
+        }
+        m_array[m_size++]=v;
+    }
+
+    size_t get_size (void) const
+    // Effect: Return the number of elements in the array.
+    {
+        return m_size;
+    }
+    size_t memory_size(void) const
+    // Effect: Return the size (in bytes) that the array occupies in memory.  This is really only an estimate.
+    //  It counts the object itself plus the allocated capacity, not just the elements in use.
+    {
+        return sizeof(*this)+sizeof(T)*m_size_limit;
+    }
+
+ private:
+    T     *m_array;      // Storage for the elements; NULL until the first push.
+    size_t m_size;       // Number of elements currently stored.
+    size_t m_size_limit; // How much space is allocated in array.
+};
+
+}
diff --git a/storage/tokudb/PerconaFT/util/kibbutz.cc b/storage/tokudb/PerconaFT/util/kibbutz.cc
new file mode 100644
index 00000000..409bf6bd
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/kibbutz.cc
@@ -0,0 +1,242 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <memory.h>
+
+#include <portability/toku_config.h>
+#include <portability/toku_time.h>
+#include <toku_pthread.h>
+
+#include "kibbutz.h"
+
+// A Kibbutz is a collection of workers and some work to do.
+// struct todo is one queued unit of work: a callback plus its argument,
+// linked into the kibbutz's doubly linked work list.
+struct todo {
+    void (*f)(void *extra);  // work function; a worker calls f(extra)
+    void *extra;             // opaque argument handed to f
+    struct todo *next;       // the item that was the head when this one was enqueued, or NULL
+    struct todo *prev;       // the item enqueued right after this one, or NULL
+};
+
+// Per-worker argument passed to work_on_kibbutz(); it only carries a pointer
+// back to the kibbutz the worker belongs to.
+struct kid {
+    struct kibbutz *k;
+};
+
+// The thread pool itself: a work list protected by `mutex`, a condition
+// variable that idle workers wait on, the worker threads, and statistics.
+struct kibbutz {
+    toku_mutex_t mutex;
+    toku_cond_t  cond;
+    bool please_shutdown;     // set by toku_kibbutz_destroy; workers exit once the list is drained
+    struct todo *head, *tail; // items are pushed at head and popped from tail.
+    int n_workers;
+    pthread_t *workers; // an array of n_workers
+    struct kid *ids;    // pass this in when creating a worker so it knows who it is.
+
+    // Statistics, reported by toku_kibbutz_get_status.
+    uint64_t threads_active;        // workers currently inside a work function
+    uint64_t queue_size;            // items currently on the work list
+    uint64_t max_queue_size;        // largest queue_size observed by toku_kibbutz_enq
+    uint64_t total_items_processed; // work functions that have returned
+    uint64_t total_execution_time;  // summed work-function run time (toku_current_time_microsec differences)
+};
+
+// Thread entry point of every worker; defined below.
+static void *work_on_kibbutz(void *);
+
+// Instrumentation keys for the kibbutz mutex, condition variable and worker
+// threads.  toku_kibbutz_create() dereferences all three, so they must point
+// at valid keys before it is called; they are not assigned in this file.
+toku_instr_key *kibbutz_mutex_key;
+toku_instr_key *kibbutz_k_cond_key;
+toku_instr_key *kibbutz_thread_key;
+
+// Create a kibbutz with n_workers worker threads.
+// On success returns 0 and stores the new kibbutz in *kb_ret.  If a worker
+// thread cannot be created, the threads started so far are shut down, the
+// kibbutz is destroyed, *kb_ret stays NULL and the error code from
+// toku_pthread_create is returned.
+int toku_kibbutz_create(int n_workers, KIBBUTZ *kb_ret) {
+    *kb_ret = NULL;
+
+    KIBBUTZ XCALLOC(k);
+    toku_mutex_init(*kibbutz_mutex_key, &k->mutex, nullptr);
+    toku_cond_init(*kibbutz_k_cond_key, &k->cond, nullptr);
+
+    // Empty work list, not shutting down.
+    k->please_shutdown = false;
+    k->head = NULL;
+    k->tail = NULL;
+
+    // Statistics start at zero.
+    k->threads_active = 0;
+    k->queue_size = 0;
+    k->max_queue_size = 0;
+    k->total_items_processed = 0;
+    k->total_execution_time = 0;
+
+    k->n_workers = n_workers;
+    XMALLOC_N(n_workers, k->workers);
+    XMALLOC_N(n_workers, k->ids);
+
+    for (int idx = 0; idx < n_workers; idx++) {
+        k->ids[idx].k = k;
+        int r = toku_pthread_create(*kibbutz_thread_key,
+                                    &k->workers[idx],
+                                    nullptr,
+                                    work_on_kibbutz,
+                                    &k->ids[idx]);
+        if (r != 0) {
+            // Only the first idx workers exist; have destroy join just those.
+            k->n_workers = idx;
+            toku_kibbutz_destroy(k);
+            return r;
+        }
+    }
+
+    *kb_ret = k;
+    return 0;
+}
+
+// Thin wrappers around the kibbutz's mutex and condition variable.
+
+// Lock the kibbutz mutex.
+static void klock (KIBBUTZ k) {
+    toku_mutex_lock(&k->mutex);
+}
+// Unlock the kibbutz mutex.
+static void kunlock (KIBBUTZ k) {
+    toku_mutex_unlock(&k->mutex);
+}
+// Wait on the kibbutz condition variable, passing it the kibbutz mutex.
+static void kwait (KIBBUTZ k) {
+    toku_cond_wait(&k->cond, &k->mutex);
+}
+// Signal the kibbutz condition variable (toku_cond_signal, not a broadcast).
+static void ksignal (KIBBUTZ k) {
+    toku_cond_signal(&k->cond);
+}
+
+//
+// pops the tail of the kibbutz off the list and works on it
+// Note that in toku_kibbutz_enq, items are enqueued at the head,
+// making the work be done in FIFO order. This is necessary
+// to avoid deadlocks in flusher threads.
+//
+// Thread entry point: kidv is the struct kid for this worker.  Returns
+// nullptr once shutdown has been requested and the work list is empty.
+//
+static void *work_on_kibbutz (void *kidv) {
+    struct kid *CAST_FROM_VOIDP(kid, kidv);
+    KIBBUTZ k = kid->k;
+    klock(k);
+    while (1) {
+        // Drain the list.  The mutex is held at the top of every iteration
+        // and released only while the work function runs.
+        while (k->tail) {
+            struct todo *item = k->tail;
+            k->tail = item->prev;
+            toku_sync_sub_and_fetch(&k->queue_size, 1);
+            if (k->tail==NULL) {
+                k->head=NULL;
+            } else {
+                // if there are other things to do, then wake up the next guy, if there is one.
+                ksignal(k);
+            }
+            kunlock(k);
+            // Run the work function outside the mutex and time it.
+            toku_sync_add_and_fetch(&k->threads_active, 1);
+            uint64_t starttime = toku_current_time_microsec();
+            item->f(item->extra);
+            uint64_t duration = toku_current_time_microsec() - starttime;
+            toku_sync_add_and_fetch(&k->total_execution_time, duration);
+            toku_sync_add_and_fetch(&k->total_items_processed, 1);
+            toku_sync_sub_and_fetch(&k->threads_active, 1);
+            toku_free(item);
+            klock(k);
+            // if there's another item on k->tail, then we'll just go grab it now, without waiting for a signal.
+        }
+        if (k->please_shutdown) {
+            // Don't follow this unless the work is all done, so that when we
+            // set please_shutdown, all the work finishes before any threads
+            // quit.
+            ksignal(k);  // must wake up anyone else who is waiting, so they can
+                         // shut down.
+            kunlock(k);
+            toku_instr_delete_current_thread();
+            return nullptr;
+        }
+        // There is no work to do and it's not time to shutdown, so wait.
+        kwait(k);
+    }
+}
+
+//
+// Queue f(extra) for execution by one of the kibbutz's workers.
+// New items go on the head of the list while work_on_kibbutz takes items
+// from the tail, so work is done in FIFO order. This is necessary
+// to avoid deadlocks in flusher threads.
+// Must not be called once toku_kibbutz_destroy has been called (asserted).
+//
+void toku_kibbutz_enq (KIBBUTZ k, void (*f)(void*), void *extra) {
+    // Build the item before taking the lock; it is private until linked in.
+    struct todo *XMALLOC(td);
+    td->f = f;
+    td->extra = extra;
+    td->prev = NULL;
+
+    klock(k);
+    assert(!k->please_shutdown);
+
+    // Link the item in front of the current head.
+    td->next = k->head;
+    if (k->head != NULL) {
+        assert(k->head->prev == NULL);
+        k->head->prev = td;
+    }
+    k->head = td;
+    if (k->tail == NULL) {
+        // The list was empty, so the new item is the tail as well.
+        k->tail = td;
+    }
+
+    const uint64_t newsize = toku_sync_add_and_fetch(&k->queue_size, 1);
+    // not exactly precise but we'll live with it
+    if (k->max_queue_size < newsize) {
+        k->max_queue_size = k->queue_size;
+    }
+
+    // Wake one worker to pick the item up.
+    ksignal(k);
+    kunlock(k);
+}
+
+// Copy the kibbutz's statistics into the caller's variables.  All output
+// pointers are written unconditionally.  The fields are read without taking
+// the kibbutz mutex, so the values are not a consistent snapshot.
+void toku_kibbutz_get_status(KIBBUTZ k,
+                             uint64_t *num_threads,
+                             uint64_t *num_threads_active,
+                             uint64_t *queue_size,
+                             uint64_t *max_queue_size,
+                             uint64_t *total_items_processed,
+                             uint64_t *total_execution_time) {
+    *num_threads = k->n_workers;
+    *num_threads_active = k->threads_active;
+    *queue_size = k->queue_size;
+    *max_queue_size = k->max_queue_size;
+    *total_items_processed = k->total_items_processed;
+    *total_execution_time = k->total_execution_time / 1000; // return in ms.
+}
+
+void toku_kibbutz_destroy (KIBBUTZ k)
+// Effect: wait for all the enqueued work to finish, and then destroy the kibbutz.
+//  Note: It is an error to perform kibbutz_enq operations after this is called,
+//  and an error to call this twice on the same kibbutz (asserted).
+{
+    // Ask the workers to quit once the work list has been drained.
+    klock(k);
+    assert(!k->please_shutdown);
+    k->please_shutdown = true;
+    // A single signal is enough: every worker signals again before it exits.
+    ksignal(k);
+    kunlock(k);
+
+    // Wait for every worker thread to return.
+    const int n_to_join = k->n_workers;
+    for (int w = 0; w < n_to_join; ++w) {
+        void *result;
+        int r = toku_pthread_join(k->workers[w], &result);
+        assert(r==0);
+        assert(result==NULL);
+    }
+
+    // No thread uses the kibbutz any more; release everything it owns.
+    toku_free(k->workers);
+    toku_free(k->ids);
+    toku_cond_destroy(&k->cond);
+    toku_mutex_destroy(&k->mutex);
+    toku_free(k);
+}
diff --git a/storage/tokudb/PerconaFT/util/kibbutz.h b/storage/tokudb/PerconaFT/util/kibbutz.h
new file mode 100644
index 00000000..74cd5a6d
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/kibbutz.h
@@ -0,0 +1,74 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+//
+// The kibbutz is another threadpool meant to do arbitrary work.
+//
+
+typedef struct kibbutz *KIBBUTZ;
+//
+// create a kibbutz where n_workers is the number of threads in the threadpool
+//
+int toku_kibbutz_create (int n_workers, KIBBUTZ *kb);
+//
+// enqueue a workitem in the kibbutz. When the kibbutz is to work on this workitem,
+// it calls f(extra). 
+// At any time, the kibbutz is operating on at most n_workers jobs. 
+// Other enqueued workitems are on a queue. An invariant is 
+// that no currently enqueued item was placed on the queue before 
+// any item that is currently being operated on. Another way to state
+// this is that all items on the queue were placed there after every item
+// that is currently being worked on.
+//
+void toku_kibbutz_enq (KIBBUTZ k, void (*f)(void*), void *extra);
+//
+// get kibbutz status
+//
+void toku_kibbutz_get_status(KIBBUTZ k,
+                             uint64_t *num_threads,
+                             uint64_t *num_threads_active,
+                             uint64_t *queue_size,
+                             uint64_t *max_queue_size,
+                             uint64_t *total_items_processed,
+                             uint64_t *total_execution_time);
+//
+// destroys the kibbutz
+//
+void toku_kibbutz_destroy (KIBBUTZ k);
diff --git a/storage/tokudb/PerconaFT/util/memarena.cc b/storage/tokudb/PerconaFT/util/memarena.cc
new file mode 100644
index 00000000..8c054221
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/memarena.cc
@@ -0,0 +1,191 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <algorithm>
+#include <string.h>
+#include <memory.h>
+
+#include <util/memarena.h>
+
+// Effect: reset the arena's bookkeeping, then allocate the first chunk up front when initial_size is nonzero.
+void memarena::create(size_t initial_size) {
+    _other_chunks = nullptr;
+    _n_other_chunks = 0;
+    _size_of_other_chunks = 0;
+    _footprint_of_other_chunks = 0;
+    _current_chunk = arena_chunk();  // chunks the arena may already hold are not freed here
+    _current_chunk.size = initial_size;
+    if (initial_size != 0) {
+        XMALLOC_N(initial_size, _current_chunk.buf);
+    }
+}
+
+// Effect: free every chunk owned by this memarena and leave it in the empty state (no chunks, zeroed accounting).
+void memarena::destroy(void) {
+    if (_current_chunk.buf) toku_free(_current_chunk.buf);
+    for (int i = 0; i < _n_other_chunks; i++) {
+        toku_free(_other_chunks[i].buf);
+    }
+    if (_other_chunks) toku_free(_other_chunks);
+    _current_chunk = arena_chunk();
+    _other_chunks = nullptr;
+    _n_other_chunks = 0;
+    // also zero the byte accounting so total_size_in_use()/total_footprint() stop counting the freed chunks
+    _size_of_other_chunks = 0;
+    _footprint_of_other_chunks = 0;
+}
+
+static size_t round_to_page(size_t size) {  // rounds size up to a whole number of 4096-byte pages
+    const size_t page_size = 4096;
+    const size_t rounded = (size + (page_size - 1)) & ~(page_size - 1);
+    assert(rounded % page_size == 0);       // the result must be page aligned
+    assert(rounded >= size);                // it must never round down
+    assert(rounded < size + page_size);     // and it must add less than one full page
+    return rounded;
+}
+
+static const size_t MEMARENA_MAX_CHUNK_SIZE = 64 * 1024 * 1024; // 64 MiB cap on the doubling of chunk sizes in malloc_from_arena(); a single larger request still gets a chunk big enough for it
+
+void *memarena::malloc_from_arena(size_t size) {  // Effect: return size bytes carved from the current chunk, starting a new chunk first if needed
+    if (_current_chunk.buf == nullptr || _current_chunk.size < _current_chunk.used + size) {
+        // The existing block isn't big enough (or there is no block yet).
+        // Add the block to the vector of blocks.
+        if (_current_chunk.buf) {
+            invariant(_current_chunk.size > 0);
+            int old_n = _n_other_chunks;
+            XREALLOC_N(old_n + 1, _other_chunks);  // grow the array of retired chunks by one slot
+            _other_chunks[old_n] = _current_chunk;
+            _n_other_chunks = old_n + 1;
+            _size_of_other_chunks += _current_chunk.size;  // retired chunks are counted at their full size, not just the used part
+            _footprint_of_other_chunks += toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
+        }
+
+        // Make a new one. Grow the buffer size exponentially until we hit
+        // the max chunk size, but make it at least `size' bytes so the
+        // current allocation always fits.
+        size_t new_size = std::min(MEMARENA_MAX_CHUNK_SIZE, 2 * _current_chunk.size);
+        if (new_size < size) {
+            new_size = size;
+        }
+        new_size = round_to_page(new_size); // at least size, but rounded up to a multiple of the page size
+        XMALLOC_N(new_size, _current_chunk.buf);
+        _current_chunk.used = 0;
+        _current_chunk.size = new_size;
+    }
+    invariant(_current_chunk.buf != nullptr);
+
+    // allocate in the existing block: hand out the next unused bytes; no alignment padding is inserted here.
+    char *p = _current_chunk.buf + _current_chunk.used;
+    _current_chunk.used += size;
+    return p;
+}
+
+void memarena::move_memory(memarena *dest) {
+    // Move memory to dest: every chunk of this arena (the other chunks and the current one) becomes one of dest's other chunks.
+    XREALLOC_N(dest->_n_other_chunks + _n_other_chunks + 1, dest->_other_chunks);  // room for our other chunks plus our current chunk
+    dest->_size_of_other_chunks += _size_of_other_chunks + _current_chunk.size;
+    dest->_footprint_of_other_chunks += _footprint_of_other_chunks + toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
+    for (int i = 0; i < _n_other_chunks; i++) {
+        dest->_other_chunks[dest->_n_other_chunks++] = _other_chunks[i];
+    }
+    dest->_other_chunks[dest->_n_other_chunks++] = _current_chunk;  // NOTE(review): appended even when _current_chunk.buf is nullptr, leaving an empty entry in dest -- confirm intended
+
+    // Clear out this memarena's memory: only the chunk array is freed, the chunk buffers now belong to dest
+    toku_free(_other_chunks);
+    _current_chunk = arena_chunk();
+    _other_chunks = nullptr;
+    _size_of_other_chunks = 0;
+    _footprint_of_other_chunks = 0;
+    _n_other_chunks = 0;
+}
+
+size_t memarena::total_memory_size(void) const {
+    // the arena object itself + the bytes reported by total_size_in_use() + the array describing the other chunks
+    const size_t chunk_array_bytes = _n_other_chunks * sizeof(*_other_chunks);
+    return sizeof(*this) + total_size_in_use() + chunk_array_bytes;
+}
+
+size_t memarena::total_size_in_use(void) const {
+    return _current_chunk.used + _size_of_other_chunks;  // used part of the current chunk + accumulated size of the other chunks
+}
+
+size_t memarena::total_footprint(void) const {
+    // the current chunk is measured now; the other chunks were measured when they were added to _other_chunks
+    const size_t live_footprint = toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
+    const size_t chunk_array_bytes = _n_other_chunks * sizeof(*_other_chunks);
+    return sizeof(*this) + _footprint_of_other_chunks + live_footprint + chunk_array_bytes;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+const void *memarena::chunk_iterator::current(size_t *used) const {
+    // negative index: the arena's current chunk; in-range index: an entry of _other_chunks; otherwise past the end
+    const memarena::arena_chunk *chunk = nullptr;
+    if (_chunk_idx < 0) {
+        chunk = &_ma->_current_chunk;
+    } else if (_chunk_idx < _ma->_n_other_chunks) {
+        chunk = &_ma->_other_chunks[_chunk_idx];
+    }
+    *used = (chunk != nullptr) ? chunk->used : 0;
+    return (chunk != nullptr) ? chunk->buf : nullptr;
+}
+
+void memarena::chunk_iterator::next() {
+    ++_chunk_idx;  // -1 (the current chunk) -> 0 -> 1 ... through the _other_chunks array
+}
+
+bool memarena::chunk_iterator::more() const {
+    // On the current chunk (negative index) there is something to visit only if it has a buffer; after that,
+    // while the index stays inside _other_chunks.  NOTE(review): an arena with no current buffer but a non-empty
+    // _other_chunks (e.g. the target of move_memory()) reports false right away -- confirm callers never iterate one.
+    return (_chunk_idx < 0) ? (_ma->_current_chunk.buf != nullptr) : (_chunk_idx < _ma->_n_other_chunks);
+}
diff --git a/storage/tokudb/PerconaFT/util/memarena.h b/storage/tokudb/PerconaFT/util/memarena.h
new file mode 100644
index 00000000..c1de3c94
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/memarena.h
@@ -0,0 +1,136 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+/*
+ * A memarena is used to efficiently store a collection of objects that never move.
+ * The pattern is: allocate more and more stuff, then free all of the items at once.
+ * The underlying memory will store 1 or more objects per chunk. Each chunk is
+ * contiguously laid out in memory but chunks are not necessarily contiguous with
+ * each other.
+ */
+class memarena {
+public:
+    memarena() :  // starts empty: no current chunk, no other chunks, zeroed accounting
+        _current_chunk(arena_chunk()),
+        _other_chunks(nullptr),
+        _n_other_chunks(0),
+        _size_of_other_chunks(0),
+        _footprint_of_other_chunks(0) {
+    }
+
+    // Effect: Create a memarena with the specified initial size (0 means no chunk is allocated up front)
+    void create(size_t initial_size);
+
+    void destroy(void);  // Effect: Free all the chunks owned by the memarena
+
+    // Effect: Allocate some memory.  The returned value remains valid until the memarena is destroyed.
+    //  In case of ENOMEM, aborts.
+    void *malloc_from_arena(size_t size);
+
+    // Effect: Move all the memory from this memarena (SOURCE) into DEST.
+    //         When SOURCE is destroyed the moved memory won't be freed.
+    //         When DEST is destroyed, the memory will be freed, unless DEST moves its memory to another memarena...
+    void move_memory(memarena *dest);
+
+    // Effect: Calculate the amount of memory used by a memory arena: the arena object, total_size_in_use(), and the array of other chunks.
+    size_t total_memory_size(void) const;
+
+    // Effect: Calculate the used space of the memory arena (ie: excludes the unused tail of the current chunk; the other chunks count at full size)
+    size_t total_size_in_use(void) const;
+
+    // Effect: Calculate the amount of memory used, according to toku_memory_footprint(),
+    //         which is a more expensive but more accurate count of memory used.
+    size_t total_footprint(void) const;
+
+    // iterator over the underlying chunks that store objects in the memarena.
+    // a chunk is represented by a pointer to const memory and a usable byte count.
+    class chunk_iterator {
+    public:
+        chunk_iterator(const memarena *ma) :  // starts positioned on ma's current chunk (index -1)
+            _ma(ma), _chunk_idx(-1) {
+        }
+
+        // returns: base pointer to the current chunk
+        //          *used set to the number of usable bytes
+        //          if more() is false, returns nullptr and *used = 0
+        const void *current(size_t *used) const;
+
+        // requires: more() is true
+        void next();
+
+        bool more() const;  // true while the iterator is positioned on a chunk that can be read with current()
+
+    private:
+        // -1 represents the 'initial' chunk in a memarena, ie: ma->_current_chunk
+        // >= 0 represents the i'th chunk in the ma->_other_chunks array
+        const memarena *_ma;
+        int _chunk_idx;
+    };
+
+private:
+    struct arena_chunk {
+        arena_chunk() : buf(nullptr), used(0), size(0) { }
+        char *buf;    // start of the chunk's memory, or nullptr when the chunk is empty
+        size_t used;  // bytes of buf already handed out by malloc_from_arena()
+        size_t size;  // allocated size of buf in bytes
+    };
+
+    struct arena_chunk _current_chunk;  // the chunk new allocations are carved from
+    struct arena_chunk *_other_chunks;  // array of _n_other_chunks chunks that are no longer allocated from
+    int _n_other_chunks;
+    size_t _size_of_other_chunks; // the buf_size of all the other chunks.
+    size_t _footprint_of_other_chunks; // the footprint of all the other chunks.
+
+    friend class memarena_unit_test;
+};
diff --git a/storage/tokudb/PerconaFT/util/mempool.cc b/storage/tokudb/PerconaFT/util/mempool.cc
new file mode 100644
index 00000000..b27be71c
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/mempool.cc
@@ -0,0 +1,197 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <string.h>
+#include <memory.h>
+#include <toku_assert.h>
+#include "mempool.h"
+
+/* Contract:
+ * Caller allocates mempool struct as convenient for caller, but memory used for data storage
+ * must be dynamically allocated via toku_malloc().
+ * Caller dynamically allocates memory for mempool and initializes mempool by calling toku_mempool_init().
+ * Once a buffer is assigned to a mempool (via toku_mempool_init()), the mempool owns it and
+ * is responsible for destroying it when the mempool is destroyed.
+ * Caller destroys mempool by calling toku_mempool_destroy().
+ *
+ * Note, toku_mempool_init() does not allocate the memory because sometimes the caller will already have
+ * the memory allocated and will assign the pre-allocated memory to the mempool.
+ */
+
+/* Constructor for the case where the caller has already allocated the mempool
+ * struct itself, but no memory has yet been allocated for the data.
+ */
+void toku_mempool_zero(struct mempool *mp) {
+    // clears every field of the struct (base, free_offset, size, frag_size) in one shot
+    memset(mp, 0, sizeof(struct mempool));
+}
+
+// TODO 4050 this is dirty, try to replace all uses of this
+// Attach the existing buffer `base' of `size' bytes to mp, treating its first free_offset bytes as already allocated.
+void toku_mempool_init(struct mempool *mp, void *base, size_t free_offset, size_t size) {
+    paranoid_invariant(base != 0);
+    paranoid_invariant(size < (1U<<31)); // used to be assert(size >= 0), but changed to size_t so now let's make sure it's not more than 2GB...
+    paranoid_invariant(free_offset <= size);
+    mp->base = base;
+    mp->free_offset = free_offset;  // offset (not address) of the first available byte
+    mp->size = size;
+    mp->frag_size = 0;              // byte count of wasted space (formerly used, no longer used or available)
+}
+
+/* Allocate a fresh data buffer for the mempool, sized data_size plus 25% slack.
+ * A data_size of zero allocates nothing: the mempool is just zeroed. */
+void toku_mempool_construct(struct mempool *mp, size_t data_size) {
+    if (data_size == 0) {
+        toku_mempool_zero(mp);
+        return;
+    }
+    // any buffer mp already pointed at is not freed here
+    // room for the data plus a quarter more
+    const size_t padded_size = data_size + (data_size / 4);
+    mp->base = toku_xmalloc_aligned(64, padded_size);
+    mp->size = padded_size;
+    mp->free_offset = 0;
+    mp->frag_size = 0;
+}
+
+void toku_mempool_reset(struct mempool *mp) {
+    mp->frag_size = 0;    // nothing is counted as wasted any more
+    mp->free_offset = 0;  // the next allocation starts again at the base; base and size are kept
+}
+
+void toku_mempool_realloc_larger(struct mempool *mp, size_t data_size) {
+    invariant(data_size >= mp->free_offset);  // the new buffer must hold everything allocated so far
+    // size the replacement at data_size plus 1/4 room for expansion (would be wasted if read-only)
+    const size_t grown_size = data_size + (data_size/4);
+    void *const grown_buf = toku_xmalloc_aligned(64, grown_size);
+    memcpy(grown_buf, mp->base, mp->free_offset);  // carry over the allocated prefix; free_offset and frag_size are unchanged
+    toku_free(mp->base);
+    mp->base = grown_buf;
+    mp->size = grown_size;
+}
+
+
+void toku_mempool_destroy(struct mempool *mp) {
+    // free the data buffer if one is attached, then put the
+    // struct back into the all-zero state of toku_mempool_zero()
+    if (mp->base != nullptr) { toku_free(mp->base); }
+    toku_mempool_zero(mp);
+}
+
+void *toku_mempool_get_base(const struct mempool *mp) {
+    return mp->base;  // start of the pool's data buffer
+}
+
+void *toku_mempool_get_pointer_from_base_and_offset(const struct mempool *mp, size_t offset) {
+    return reinterpret_cast<void*>(reinterpret_cast<char*>(mp->base) + offset);  // base + offset bytes; offset is not range-checked here
+}
+
+size_t toku_mempool_get_offset_from_pointer_and_base(const struct mempool *mp, const void* p) {
+    paranoid_invariant(p >= mp->base);  // p must not lie before the start of the buffer
+    return reinterpret_cast<const char*>(p) - reinterpret_cast<const char*>(mp->base);  // byte distance from base to p
+}
+
+size_t toku_mempool_get_size(const struct mempool *mp) {
+    return mp->size;  // total size of the data buffer in bytes
+}
+
+size_t toku_mempool_get_frag_size(const struct mempool *mp) {
+    return mp->frag_size;  // bytes released through toku_mempool_mfree() since the pool was set up or reset
+}
+
+size_t toku_mempool_get_used_size(const struct mempool *mp) {
+    return mp->free_offset - mp->frag_size;  // bytes allocated so far minus the bytes counted as fragmented
+}
+
+void* toku_mempool_get_next_free_ptr(const struct mempool *mp) {
+    return toku_mempool_get_pointer_from_base_and_offset(mp, mp->free_offset);  // address the next allocation would start at
+}
+
+size_t toku_mempool_get_offset_limit(const struct mempool *mp) {
+    return mp->free_offset;  // same value as toku_mempool_get_allocated_size()
+}
+
+size_t toku_mempool_get_free_size(const struct mempool *mp) {
+    return mp->size - mp->free_offset;  // bytes still available after the allocated prefix
+}
+
+size_t toku_mempool_get_allocated_size(const struct mempool *mp) {
+    return mp->free_offset;  // bytes handed out so far, fragmented or not
+}
+
+// Bump-allocate size bytes from the pool; returns nullptr and leaves the pool unchanged when they do not fit.
+void *toku_mempool_malloc(struct mempool *mp, size_t size) {
+    paranoid_invariant(size < (1U<<31));
+    paranoid_invariant(mp->size < (1U<<31));
+    paranoid_invariant(mp->free_offset < (1U<<31));
+    paranoid_invariant(mp->free_offset <= mp->size);
+    void *vp = nullptr;
+    // compare against the remaining room: free_offset + size could wrap around for a huge size and pass the check
+    if (mp->free_offset <= mp->size && size <= mp->size - mp->free_offset) {
+        vp = reinterpret_cast<char *>(mp->base) + mp->free_offset;
+        mp->free_offset += size;
+    }
+    paranoid_invariant(mp->free_offset <= mp->size);
+    paranoid_invariant(vp == 0 || toku_mempool_inrange(mp, vp, size));
+    return vp;
+}
+
+// vp may be null: then we are freeing something, but not specifying what.  The data won't be freed until compression is done; only frag_size grows here.
+void toku_mempool_mfree(struct mempool *mp, void *vp, size_t size) {
+    paranoid_invariant(vp == nullptr || toku_mempool_inrange(mp, vp, size));
+    mp->frag_size += size;
+    invariant(mp->frag_size <= mp->free_offset);
+    invariant(mp->frag_size <= mp->size);
+}
+
+
+/* Get the memory footprint of the pool: toku_memory_footprint() applied to
+ * the buffer base and the number of bytes allocated from it so far
+ * (free_offset).  Space beyond free_offset is not passed in.
+ */
+size_t toku_mempool_footprint(struct mempool *mp) {
+    return toku_memory_footprint(mp->base, mp->free_offset);
+}
+
+void toku_mempool_clone(const struct mempool* orig_mp, struct mempool* new_mp) {
+    const size_t live_bytes = orig_mp->free_offset;  // only make the cloned mempool store what is needed
+    new_mp->base = toku_xmalloc_aligned(64, live_bytes);
+    new_mp->size = new_mp->free_offset = live_bytes;  // the clone is exactly full: it has no free space left
+    new_mp->frag_size = orig_mp->frag_size;
+    memcpy(new_mp->base, orig_mp->base, live_bytes);
+}
diff --git a/storage/tokudb/PerconaFT/util/mempool.h b/storage/tokudb/PerconaFT/util/mempool.h
new file mode 100644
index 00000000..feafdc17
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/mempool.h
@@ -0,0 +1,129 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+/* a memory pool is a contiguous region of memory that supports single
+   allocations from the pool.  these allocated regions are never recycled.
+   when the memory pool no longer has free space, the allocated chunks
+   must be relocated by the application to a new memory pool. */
+
+#include <stddef.h>
+
+struct mempool;
+
+  // TODO 4050 Hide mempool struct internals from callers
+
+struct mempool {
+    void *base;           /* the base address of the memory */
+    size_t free_offset;      /* offset from base of the first free byte; everything before it has been allocated */
+    size_t size;             /* the total size of the memory, in bytes */
+    size_t frag_size;        /* the size of the fragmented memory: allocated bytes that were later released with toku_mempool_mfree() */
+};
+
+/* This is a constructor to be used when the memory for the mempool struct has been
+ * allocated by the caller, but no memory has yet been allocated for the data.
+ */
+void toku_mempool_zero(struct mempool *mp);
+
+/* initialize the memory pool with the base address and size of a
+   contiguous chunk of memory */
+void toku_mempool_init(struct mempool *mp, void *base, size_t free_offset, size_t size);
+
+/* allocate memory and construct mempool
+ */
+void toku_mempool_construct(struct mempool *mp, size_t data_size);
+
+/* treat mempool as if it has just been created; ignore any frag and start allocating from beginning again.
+ */
+void toku_mempool_reset(struct mempool *mp);
+
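+/* move the mempool's contents into a newly allocated, larger buffer sized for data_size
+ * bytes plus 25% slack; data_size must be at least the space already allocated */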
+void toku_mempool_realloc_larger(struct mempool *mp, size_t data_size);
+
+/* destroy the memory pool */
+void toku_mempool_destroy(struct mempool *mp);
+
+/* get the base address of the memory pool */
+void *toku_mempool_get_base(const struct mempool *mp);
+
+/* get a pointer that is offset bytes past the base of the memory pool */
+void *toku_mempool_get_pointer_from_base_and_offset(const struct mempool *mp, size_t offset);
+
+/* get the offset from base of a pointer */
+size_t toku_mempool_get_offset_from_pointer_and_base(const struct mempool *mp, const void* p);
+
+/* get a pointer to the first free byte (if any) */
+void* toku_mempool_get_next_free_ptr(const struct mempool *mp);
+
+/* get the limit of valid offsets.  (anything later was not allocated) */
+size_t toku_mempool_get_offset_limit(const struct mempool *mp);
+
+/* get the size of the memory pool */
+size_t toku_mempool_get_size(const struct mempool *mp);
+
+/* get the amount of fragmented (wasted) space in the memory pool */
+size_t toku_mempool_get_frag_size(const struct mempool *mp);
+
+/* get the amount of space that is holding useful data */
+size_t toku_mempool_get_used_size(const struct mempool *mp);
+
+/* get the amount of space that is available for new data */
+size_t toku_mempool_get_free_size(const struct mempool *mp);
+
+/* get the amount of space that has been allocated for use (wasted or not) */
+size_t toku_mempool_get_allocated_size(const struct mempool *mp);
+
+/* allocate a chunk of memory from the memory pool */
+void *toku_mempool_malloc(struct mempool *mp, size_t size);
+
+/* free a previously allocated chunk of memory.  the free only updates
+   a count of the amount of free space in the memory pool.  the memory
+   pool does not keep track of the locations of the free chunks */
+void toku_mempool_mfree(struct mempool *mp, void *vp, size_t size);
+
+/* verify that the range [vp, vp + size) lies within the mempool's buffer [base, base + size of pool) */
+static inline int toku_mempool_inrange(struct mempool *mp, void *vp, size_t size) {
+    return !(vp < mp->base) && !((char *)mp->base + mp->size < (char *)vp + size);
+}
+
+/* get memory footprint */
+size_t toku_mempool_footprint(struct mempool *mp);
+
+void toku_mempool_clone(const struct mempool* orig_mp, struct mempool* new_mp);
diff --git a/storage/tokudb/PerconaFT/util/minicron.cc b/storage/tokudb/PerconaFT/util/minicron.cc
new file mode 100644
index 00000000..241e498c
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/minicron.cc
@@ -0,0 +1,201 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <toku_portability.h>
+#include <errno.h>
+#include <string.h>
+
+#include "portability/toku_assert.h"
+#include "util/minicron.h"
+
+toku_instr_key *minicron_p_mutex_key;
+toku_instr_key *minicron_p_condvar_key;
+toku_instr_key *minicron_thread_key;
+
+// Store the current time, as reported by gettimeofday, into *a.
+// gettimeofday reports microseconds, so they are scaled by 1000 to fill
+// the nanosecond field.
+static void toku_gettime(toku_timespec_t *a) {
+    struct timeval now_tv;
+    gettimeofday(&now_tv, nullptr);
+    a->tv_nsec = now_tv.tv_usec * 1000LL;
+    a->tv_sec  = now_tv.tv_sec;
+}
+    
+
+// Three-way comparison of two timespecs: returns 1 if *a is later than *b,
+// -1 if it is earlier, and 0 if both fields are equal.  Seconds are compared
+// first; nanoseconds only break ties.
+static int
+timespec_compare (toku_timespec_t *a, toku_timespec_t *b) {
+    if (a->tv_sec != b->tv_sec) {
+        return (a->tv_sec > b->tv_sec) ? 1 : -1;
+    }
+    if (a->tv_nsec != b->tv_nsec) {
+        return (a->tv_nsec > b->tv_nsec) ? 1 : -1;
+    }
+    return 0;
+}
+
+// Implementation notes:
+//  When calling do_shutdown or change_period, the mutex is obtained, the variables in the minicron struct are modified, and
+//  the condition variable is signalled.  Possibly the minicron thread will miss the signal.  To avoid this problem, whenever
+//  the minicron thread acquires the mutex, it must check to see what the variables say to do (e.g., should it shut down?).
+
+// Return *base advanced by period_in_ms milliseconds.
+// The millisecond remainder is carried into tv_sec so that tv_nsec stays
+// below one second (this holds whenever base->tv_nsec is itself below one
+// second, which is the case for values filled in by toku_gettime).  Without
+// the carry, tv_nsec could reach 1e9 or more, which is not a valid timespec
+// for a timed wait (POSIX specifies EINVAL for pthread_cond_timedwait) and
+// which breaks the seconds-first ordering used by timespec_compare.
+static toku_timespec_t
+minicron_deadline (const toku_timespec_t *base, uint32_t period_in_ms)
+{
+    toku_timespec_t deadline = *base;
+    deadline.tv_sec += period_in_ms / 1000;
+    deadline.tv_nsec += (long)(period_in_ms % 1000) * 1000000L;
+    if (deadline.tv_nsec >= 1000000000L) {
+        deadline.tv_nsec -= 1000000000L;
+        deadline.tv_sec += 1;
+    }
+    return deadline;
+}
+
+// Thread body of a minicron.  pv is the struct minicron set up by
+// toku_minicron_setup.  The loop runs with p->mutex held except while
+// sleeping in usleep or while calling p->f, and returns (via
+// toku_pthread_done) once p->do_shutdown is observed.
+//   period == 0        : block on the condition variable until signalled.
+//   0 < period <= 1000 : sleep for the period, then call f.
+//   period > 1000      : wait on the condition variable until
+//                        time_of_last_call_to_f + period, then call f if
+//                        that deadline has passed.
+static void*
+minicron_do (void *pv)
+{
+    struct minicron *CAST_FROM_VOIDP(p, pv);
+    toku_mutex_lock(&p->mutex);
+    while (1) {
+        if (p->do_shutdown) {
+            toku_mutex_unlock(&p->mutex);
+            toku_instr_delete_current_thread();
+            return toku_pthread_done(nullptr);
+        }
+        if (p->period_in_ms == 0) {
+            // if we aren't supposed to do it then just do an untimed wait.
+            toku_cond_wait(&p->condvar, &p->mutex);
+        }
+        else if (p->period_in_ms <= 1000) {
+            // Copy the period before dropping the mutex; the sleep itself is
+            // done unlocked and is not interrupted by the condition variable.
+            uint32_t period_in_ms = p->period_in_ms;
+            toku_mutex_unlock(&p->mutex);
+            usleep(period_in_ms * 1000);
+            toku_mutex_lock(&p->mutex);
+        }
+        else {
+            // Recompute the wakeup time every time (instead of once per call to f) in case the period changes.
+            toku_timespec_t wakeup_at = minicron_deadline(&p->time_of_last_call_to_f, p->period_in_ms);
+            toku_timespec_t now;
+            toku_gettime(&now);
+            int compare = timespec_compare(&wakeup_at, &now);
+            // if the time to wakeup has yet to come, then we sleep
+            // otherwise, we continue
+            if (compare > 0) {
+                int r = toku_cond_timedwait(&p->condvar, &p->mutex, &wakeup_at);
+                if (r!=0 && r!=ETIMEDOUT) fprintf(stderr, "%s:%d r=%d (%s)\n", __FILE__, __LINE__, r, strerror(r));
+                assert(r==0 || r==ETIMEDOUT);
+            }
+        }
+        // Now we woke up, and we should figure out what to do
+        if (p->do_shutdown) {
+            toku_mutex_unlock(&p->mutex);
+            toku_instr_delete_current_thread();
+            return toku_pthread_done(nullptr);
+        }
+        if (p->period_in_ms > 1000) {
+            // The wait above may have ended early (signal, period change), so
+            // re-check the deadline against the current time before calling f.
+            toku_timespec_t now;
+            toku_gettime(&now);
+            toku_timespec_t time_to_call = minicron_deadline(&p->time_of_last_call_to_f, p->period_in_ms);
+            int compare = timespec_compare(&time_to_call, &now);
+            if (compare <= 0) {
+                toku_gettime(&p->time_of_last_call_to_f); // the measured period includes the time to make the call.
+                toku_mutex_unlock(&p->mutex);
+                int r = p->f(p->arg);
+                assert(r==0);
+                toku_mutex_lock(&p->mutex);
+            }
+        }
+        else if (p->period_in_ms != 0) {
+            // Short periods: the sleep already happened above, so just call f
+            // (with the mutex released).
+            toku_mutex_unlock(&p->mutex);
+            int r = p->f(p->arg);
+            assert(r==0);
+            toku_mutex_lock(&p->mutex);
+        }
+    }
+}
+
+// Initialize *p and start its worker thread running minicron_do.
+//  p            : caller-allocated minicron to initialize.
+//  period_in_ms : initial period in milliseconds.
+//  f, arg       : the job and the argument it is called with.
+// Returns the result of toku_pthread_create.
+int
+toku_minicron_setup(struct minicron *p, uint32_t period_in_ms, int(*f)(void *), void *arg)
+{
+    // Every field the worker reads is filled in before the thread exists.
+    p->do_shutdown = false;
+    p->period_in_ms = period_in_ms;
+    p->arg = arg;
+    p->f = f;
+    toku_gettime(&p->time_of_last_call_to_f);
+    toku_mutex_init(*minicron_p_mutex_key, &p->mutex, nullptr);
+    toku_cond_init(*minicron_p_condvar_key, &p->condvar, nullptr);
+    const int create_result = toku_pthread_create(
+        *minicron_thread_key, &p->thread, nullptr, minicron_do, p);
+    return create_result;
+}
+
+// Replace the period (in milliseconds) of a running minicron.
+// The new value is stored while holding p->mutex and the condition variable
+// is signalled, so a worker that is blocked on the condition variable wakes
+// up and re-reads period_in_ms.
+void toku_minicron_change_period(struct minicron *p, uint32_t new_period) {
+    toku_mutex_lock(&p->mutex);
+    p->period_in_ms = new_period;
+    toku_cond_signal(&p->condvar);
+    toku_mutex_unlock(&p->mutex);
+}
+
+/* unlocked function for use by engine status which takes no locks.
+   reports the current period truncated to whole seconds. */
+uint32_t
+toku_minicron_get_period_in_seconds_unlocked(struct minicron *p)
+{
+    return p->period_in_ms / 1000;
+}
+
+/* unlocked function for use by engine status which takes no locks.
+   reports the current period in milliseconds. */
+uint32_t
+toku_minicron_get_period_in_ms_unlocked(struct minicron *p)
+{
+    return p->period_in_ms;
+}
+
+// Stop the minicron's worker thread and release its mutex and condition
+// variable.  Asserts that shutdown has not already been requested, that the
+// join succeeds and that the thread returned a null value.  Always returns 0.
+int
+toku_minicron_shutdown(struct minicron *p) {
+    // Raise the flag under the mutex and signal, so a worker blocked on the
+    // condition variable wakes up and sees do_shutdown.
+    toku_mutex_lock(&p->mutex);
+    assert(!p->do_shutdown);
+    p->do_shutdown = true;
+    toku_cond_signal(&p->condvar);
+    toku_mutex_unlock(&p->mutex);
+
+    // Wait for the worker to exit before tearing anything down.
+    void *thread_result;
+    const int join_result = toku_pthread_join(p->thread, &thread_result);
+    if (join_result != 0) {
+        fprintf(stderr, "%s:%d r=%d (%s)\n", __FILE__, __LINE__, join_result, strerror(join_result));
+    }
+    assert(join_result == 0);
+    assert(thread_result == nullptr);
+
+    toku_cond_destroy(&p->condvar);
+    toku_mutex_destroy(&p->mutex);
+    return 0;
+}
+
+// Report whether shutdown has been requested for this minicron.
+// Reads p->do_shutdown directly, without taking p->mutex.
+bool
+toku_minicron_has_been_shutdown(struct minicron *p) {
+    return p->do_shutdown;
+}
diff --git a/storage/tokudb/PerconaFT/util/minicron.h b/storage/tokudb/PerconaFT/util/minicron.h
new file mode 100644
index 00000000..b5b19bb1
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/minicron.h
@@ -0,0 +1,74 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <toku_pthread.h>
+#include <toku_time.h>
+
+// Specification:
+// A minicron is a miniature cron job for executing a job periodically inside a pthread.
+// To create a minicron,
+//   1) allocate a "struct minicron" somewhere.
+//      Rationale:  This struct can be stored inside another struct (such as the cachetable), avoiding a malloc/free pair.
+//   2) call toku_minicron_setup, specifying a period (in milliseconds), a function, and some arguments.
+//      If the period is positive then the function is called periodically (with the period specified)
+//      Note: The period is measured from when the previous call to f finishes to when the new call starts.
+//            Thus, if the period is 5 minutes, and it takes 8 minutes to run f, then the actual periodicity is 13 minutes.
+//      Rationale:  If f always takes longer than the period to run, then it will get "behind".  This module makes getting behind explicit.
+//   3) When finished, call toku_minicron_shutdown.
+//   4) If you want to change the period, then call toku_minicron_change_period.    The time since f finished is applied to the new period
+//      and the call is rescheduled.  (If the time since f finished is more than the new period, then f is called immediately).
+
+// State of one minicron.  Allocated by the caller and initialized by
+// toku_minicron_setup.
+struct minicron {
+    toku_pthread_t thread;                  // worker thread created by toku_minicron_setup
+    toku_timespec_t time_of_last_call_to_f; // set at setup and again just before each call to f made for periods above 1000 ms
+    toku_mutex_t mutex;                     // held while period_in_ms / do_shutdown are changed by change_period / shutdown
+    toku_cond_t  condvar;                   // signalled when the period changes or shutdown is requested
+    int (*f)(void*);                        // the job; the worker asserts that it returns 0
+    void *arg;                              // argument passed to f
+    uint32_t period_in_ms;                  // period in milliseconds; 0 means f is not called
+    bool      do_shutdown;                  // set by toku_minicron_shutdown to make the worker exit
+};
+
+int toku_minicron_setup (struct minicron *s, uint32_t period_in_ms, int(*f)(void *), void *arg);
+void toku_minicron_change_period(struct minicron *p, uint32_t new_period);
+uint32_t toku_minicron_get_period_in_seconds_unlocked(struct minicron *p);
+uint32_t toku_minicron_get_period_in_ms_unlocked(struct minicron *p);
+int toku_minicron_shutdown(struct minicron *p);
+bool toku_minicron_has_been_shutdown(struct minicron *p);
diff --git a/storage/tokudb/PerconaFT/util/nb_mutex.h b/storage/tokudb/PerconaFT/util/nb_mutex.h
new file mode 100644
index 00000000..d777961a
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/nb_mutex.h
@@ -0,0 +1,136 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "rwlock.h"
+
+//Use case:
+// General purpose non blocking mutex with properties:
+// 1. one writer at a time
+
+// An external mutex must be locked when using these functions.  An alternate
+// design would bury a mutex into the nb_mutex itself.  While this may
+// increase parallelism at the expense of single thread performance, we
+// are experimenting with a single higher level lock.
+
+extern toku_instr_key *nb_mutex_key;
+
+// NB_MUTEX is a pointer to an nb_mutex.  The nb_mutex wraps an st_rwlock of
+// which only the write side is used by the functions below; when built with
+// TOKU_MYSQL_WITH_PFS it also carries a toku_mutex_t used for
+// instrumentation calls.
+typedef struct nb_mutex *NB_MUTEX;
+struct nb_mutex {
+    struct st_rwlock lock;
+#if defined(TOKU_MYSQL_WITH_PFS)
+    toku_mutex_t toku_mutex;
+#endif
+};
+
+// nb_mutex_init(MK, RK, M): initialize the nb mutex M.
+// With TOKU_MYSQL_WITH_PFS the instrumentation keys MK (mutex) and RK
+// (rwlock) are forwarded to inline_nb_mutex_init; otherwise they are dropped
+// and only M is passed.
+#if defined(TOKU_MYSQL_WITH_PFS)
+#define nb_mutex_init(MK, RK, M)                                 \
+    inline_nb_mutex_init(MK, RK, M)
+#else
+#define nb_mutex_init(MK, RK, M) inline_nb_mutex_init(M)
+#endif
+
+// initialize an nb mutex
+// The two instrumentation-key parameters exist only when built with
+// TOKU_MYSQL_WITH_PFS; in that build the embedded toku_mutex is initialized
+// as well.  Callers normally go through the nb_mutex_init macro.
+// NOTE(review): without PFS, rwlock_instr_key is not declared here, so
+// rwlock_init presumably is a macro that discards its first argument in that
+// build -- confirm in rwlock.h.
+inline void inline_nb_mutex_init(
+#if defined(TOKU_MYSQL_WITH_PFS)
+    const toku_instr_key &mutex_instr_key,
+    const toku_instr_key &rwlock_instr_key,
+#endif
+    NB_MUTEX nb_mutex) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+    toku_mutex_init(mutex_instr_key, &nb_mutex->toku_mutex, nullptr);
+#endif
+    rwlock_init(rwlock_instr_key, &nb_mutex->lock);
+}
+
+// destroy an nb mutex
+// Destroys the underlying rwlock; with TOKU_MYSQL_WITH_PFS the
+// instrumentation handle of the embedded toku_mutex is destroyed first.
+inline void nb_mutex_destroy(NB_MUTEX nb_mutex) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+    toku_instr_mutex_destroy(nb_mutex->toku_mutex.psi_mutex);
+#endif
+    rwlock_destroy(&nb_mutex->lock);
+}
+
+// Take the nb mutex by acquiring the write side of its rwlock.
+// expects: mutex is locked
+// With TOKU_MYSQL_WITH_PFS the acquisition is bracketed by the
+// instrumentation lock_start / lock_end calls.
+inline void nb_mutex_lock(NB_MUTEX nb_mutex, toku_mutex_t *mutex) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+    // TODO: pull these to caller?
+    toku_mutex_instrumentation mutex_instr;
+    toku_instr_mutex_lock_start(mutex_instr, *mutex, __FILE__, __LINE__);
+#endif
+    rwlock_write_lock(&nb_mutex->lock, mutex);
+#if defined(TOKU_MYSQL_WITH_PFS)
+    toku_instr_mutex_lock_end(mutex_instr, 0);
+#endif
+}
+
+// release a write lock
+// expects: mutex is locked
+// With TOKU_MYSQL_WITH_PFS the unlock is reported to the instrumentation
+// handle of the embedded toku_mutex before the rwlock is released.
+
+inline void nb_mutex_unlock(NB_MUTEX nb_mutex) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+    toku_instr_mutex_unlock(nb_mutex->toku_mutex.psi_mutex);
+#endif
+    rwlock_write_unlock(&nb_mutex->lock);
+}
+
+// Forwards to rwlock_wait_for_users on the underlying rwlock, passing the
+// external mutex along.
+// NOTE(review): presumably blocks until the lock has no users -- confirm in
+// rwlock.h.
+static inline void nb_mutex_wait_for_users(NB_MUTEX nb_mutex, toku_mutex_t *mutex) {
+    rwlock_wait_for_users(&nb_mutex->lock, mutex);
+}
+
+// returns: the number of writers who are waiting for the lock
+// (as reported by rwlock_blocked_writers on the underlying rwlock)
+
+static inline int nb_mutex_blocked_writers(NB_MUTEX nb_mutex) {
+    return rwlock_blocked_writers(&nb_mutex->lock);
+}
+
+// returns: the number of writers
+// (as reported by rwlock_writers on the underlying rwlock)
+
+static inline int nb_mutex_writers(NB_MUTEX nb_mutex) {
+    return rwlock_writers(&nb_mutex->lock);
+}
+
+// returns: the sum of the number of readers, pending readers,
+// writers, and pending writers
+// (as reported by rwlock_users on the underlying rwlock)
+static inline int nb_mutex_users(NB_MUTEX nb_mutex) {
+    return rwlock_users(&nb_mutex->lock);
+}
diff --git a/storage/tokudb/PerconaFT/util/omt.cc b/storage/tokudb/PerconaFT/util/omt.cc
new file mode 100644
index 00000000..44da9847
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/omt.cc
@@ -0,0 +1,1388 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <db.h>
+#include <string.h>
+
+#include <portability/memory.h>
+
+namespace toku {
+
+    // Initialize an empty omt.  create_internal is called with 2
+    // (presumably the initial capacity -- confirm against create_internal).
+    // An omt that supports marks is converted to tree form right away.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::create(void) {
+        this->create_internal(2);
+        if (supports_marks) {
+            this->convert_to_tree();
+        }
+    }
+
+    // Initialize an empty omt without allocating storage.
+    // Without marks this defers to create_internal_no_array(0).  With marks
+    // the omt is set up directly in tree form: zero capacity, no node
+    // storage, null root, free_idx 0.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::create_no_array(void) {
+        if (!supports_marks) {
+            this->create_internal_no_array(0);
+        } else {
+            this->is_array = false;
+            this->capacity = 0;
+            this->d.t.nodes = nullptr;
+            this->d.t.root.set_to_null();
+            this->d.t.free_idx = 0;
+        }
+    }
+
+    // Initialize this omt with a copy of the numvalues elements in values.
+    // The elements are copied verbatim in the given order (the caller is
+    // expected to pass them already sorted).  An omt that supports marks is
+    // converted to tree form afterwards.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::create_from_sorted_array(
+        const omtdata_t *const values,
+        const uint32_t numvalues) {
+        this->create_internal(numvalues);
+        const size_t nbytes = numvalues * sizeof(omtdata_t);
+        memcpy(this->d.a.values, values, nbytes);
+        this->d.a.num_values = numvalues;
+        if (!supports_marks) {
+            return;
+        }
+        this->convert_to_tree();
+    }
+
+    // Initialize this omt by taking over the caller's array instead of
+    // copying it.
+    //  values       : address of the caller's array pointer; must not be
+    //                 null.  *values is set to nullptr once the array has
+    //                 been taken.
+    //  numvalues    : number of elements stored in the array.
+    //  new_capacity : passed to create_internal_no_array (presumably the
+    //                 allocated capacity of the array -- confirm).
+    // An omt that supports marks is converted to tree form afterwards.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void
+    omt<omtdata_t, omtdataout_t, supports_marks>::create_steal_sorted_array(
+        omtdata_t **const values,
+        const uint32_t numvalues,
+        const uint32_t new_capacity) {
+        paranoid_invariant_notnull(values);
+        this->create_internal_no_array(new_capacity);
+        this->d.a.num_values = numvalues;
+        this->d.a.values = *values;
+        *values = nullptr;
+        if (supports_marks) {
+            this->convert_to_tree();
+        }
+    }
+
+    // Split this omt at position idx: the elements at idx and beyond are
+    // copied into newomt (which is created here) and this omt is truncated
+    // to its first idx elements.
+    // Returns EINVAL if idx is larger than the current size, 0 otherwise.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::split_at(
+        omt *const newomt,
+        const uint32_t idx) {
+        barf_if_marked(*this);
+        paranoid_invariant_notnull(newomt);
+        if (idx > this->size()) {
+            return EINVAL;
+        }
+        // Work on the array form so the tail is one contiguous run.
+        this->convert_to_array();
+        const uint32_t newsize = this->size() - idx;
+        newomt->create_from_sorted_array(
+            &this->d.a.values[this->d.a.start_idx + idx], newsize);
+        // Truncate only after the tail has been copied out.
+        this->d.a.num_values = idx;
+        this->maybe_resize_array(idx);
+        if (supports_marks) {
+            this->convert_to_tree();
+        }
+        return 0;
+    }
+
+    // Initialize this omt as the concatenation of leftomt's elements
+    // followed by rightomt's elements.  Both inputs are destroyed.
+    // An omt that supports marks is converted to tree form at the end.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::merge(
+        omt *const leftomt,
+        omt *const rightomt) {
+        barf_if_marked(*this);
+        paranoid_invariant_notnull(leftomt);
+        paranoid_invariant_notnull(rightomt);
+        const uint32_t leftsize = leftomt->size();
+        const uint32_t rightsize = rightomt->size();
+        const uint32_t newsize = leftsize + rightsize;
+
+        if (leftomt->is_array) {
+            // If the left array has enough unused slots after its last
+            // element to hold the right side, take the array over as is
+            // (keeping its start_idx); otherwise allocate a new array and
+            // copy the left elements to its front.
+            if (leftomt->capacity -
+                    (leftomt->d.a.start_idx + leftomt->d.a.num_values) >=
+                rightsize) {
+                this->create_steal_sorted_array(&leftomt->d.a.values,
+                                                leftomt->d.a.num_values,
+                                                leftomt->capacity);
+                this->d.a.start_idx = leftomt->d.a.start_idx;
+            } else {
+                this->create_internal(newsize);
+                memcpy(&this->d.a.values[0],
+                       &leftomt->d.a.values[leftomt->d.a.start_idx],
+                       leftomt->d.a.num_values * (sizeof this->d.a.values[0]));
+            }
+        } else {
+            // Left side is a tree: write its values into a new array.
+            this->create_internal(newsize);
+            leftomt->fill_array_with_subtree_values(&this->d.a.values[0],
+                                                    leftomt->d.t.root);
+        }
+        leftomt->destroy();
+        this->d.a.num_values = leftsize;
+
+        // Append the right side directly after the left elements.
+        if (rightomt->is_array) {
+            memcpy(
+                &this->d.a.values[this->d.a.start_idx + this->d.a.num_values],
+                &rightomt->d.a.values[rightomt->d.a.start_idx],
+                rightomt->d.a.num_values * (sizeof this->d.a.values[0]));
+        } else {
+            rightomt->fill_array_with_subtree_values(
+                &this->d.a.values[this->d.a.start_idx + this->d.a.num_values],
+                rightomt->d.t.root);
+        }
+        rightomt->destroy();
+        this->d.a.num_values += rightsize;
+        paranoid_invariant(this->size() == newsize);
+        if (supports_marks) {
+            this->convert_to_tree();
+        }
+    }
+
+    // Initialize this omt as a copy of src.  The values of src are written
+    // into a freshly created array, whether src is in array or tree form.
+    // An omt that supports marks is converted to tree form afterwards.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::clone(const omt &src) {
+        barf_if_marked(*this);
+        this->create_internal(src.size());
+        omtdata_t *const dest = &this->d.a.values[0];
+        if (!src.is_array) {
+            src.fill_array_with_subtree_values(dest, src.d.t.root);
+        } else {
+            const omtdata_t *const from = &src.d.a.values[src.d.a.start_idx];
+            memcpy(dest, from, src.d.a.num_values * (sizeof this->d.a.values[0]));
+        }
+        this->d.a.num_values = src.size();
+        if (!supports_marks) {
+            return;
+        }
+        this->convert_to_tree();
+    }
+
+    // Remove all elements while keeping the allocated storage.
+    // Tree form: the root becomes null and free_idx is reset to 0.
+    // Array form: start_idx and num_values are reset to 0.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::clear(void) {
+        if (!this->is_array) {
+            this->d.t.root.set_to_null();
+            this->d.t.free_idx = 0;
+            return;
+        }
+        this->d.a.start_idx = 0;
+        this->d.a.num_values = 0;
+    }
+
+    // Empty the omt and release its storage.  After the call the capacity
+    // is 0 and the storage pointer of the current form (array values or
+    // tree nodes) is null.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::destroy(void) {
+        this->clear();
+        this->capacity = 0;
+        if (!this->is_array) {
+            if (this->d.t.nodes != nullptr) {
+                toku_free(this->d.t.nodes);
+            }
+            this->d.t.nodes = nullptr;
+            return;
+        }
+        if (this->d.a.values != nullptr) {
+            toku_free(this->d.a.values);
+        }
+        this->d.a.values = nullptr;
+    }
+
+    // Number of elements stored: num_values in array form, the weight of
+    // the root subtree in tree form.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::size(void) const {
+        return this->is_array ? this->d.a.num_values
+                              : this->nweight(this->d.t.root);
+    }
+
+    // Insert value at the position determined by find_zero<omtcmp_t, h>(v).
+    //  - find_zero returns 0 (a matching element exists): nothing is
+    //    inserted, *idx (if idx is non-null) receives the position reported
+    //    by find_zero, and DB_KEYEXIST is returned.
+    //  - find_zero returns DB_NOTFOUND: value is inserted at the reported
+    //    position; on success *idx (if non-null) receives that position and
+    //    0 is returned, otherwise the insert_at error is returned.
+    //  - any other find_zero result is returned unchanged.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::insert(
+        const omtdata_t &value,
+        const omtcmp_t &v,
+        uint32_t *const idx) {
+        uint32_t position;
+        int result = this->find_zero<omtcmp_t, h>(v, nullptr, &position);
+
+        if (result == 0) {
+            result = DB_KEYEXIST;
+        } else if (result == DB_NOTFOUND) {
+            result = this->insert_at(value, position);
+            if (result != 0) {
+                return result;
+            }
+        } else {
+            return result;
+        }
+
+        if (idx != nullptr) {
+            *idx = position;
+        }
+        return result;
+    }
+
+    // The following 3 functions implement a static if for us.
+    // Overload selected for omts that do not support marks: nothing to
+    // check, so the body is empty.
+    template <typename omtdata_t, typename omtdataout_t>
+    static void barf_if_marked(
+        const omt<omtdata_t, omtdataout_t, false> &UU(omt)) {}
+
+    // Overload selected for omts that support marks: enforces (via
+    // invariant) that no marks are currently set.
+    template <typename omtdata_t, typename omtdataout_t>
+    static void barf_if_marked(const omt<omtdata_t, omtdataout_t, true> &omt) {
+        invariant(!omt.has_marks());
+    }
+
+    // True when the root node is marked or reports marks below it; false
+    // for an empty tree.  Only available when supports_marks is true.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    bool omt<omtdata_t, omtdataout_t, supports_marks>::has_marks(void) const {
+        static_assert(supports_marks, "Does not support marks");
+        if (!this->d.t.root.is_null()) {
+            const omt_node &root_node =
+                this->d.t.nodes[this->d.t.root.get_index()];
+            return root_node.get_marks_below() || root_node.get_marked();
+        }
+        return false;
+    }
+
+    // Insert value so that it ends up at position idx.
+    // Returns EINVAL if idx is larger than the current size, 0 otherwise.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::insert_at(
+        const omtdata_t &value,
+        const uint32_t idx) {
+        barf_if_marked(*this);
+        if (idx > this->size()) {
+            return EINVAL;
+        }
+
+        this->maybe_resize_or_convert(this->size() + 1);
+        // The array form is kept only for an append (idx == num_values) or
+        // for a prepend when there is a free slot before start_idx; every
+        // other insert goes through the tree form.
+        if (this->is_array && idx != this->d.a.num_values &&
+            (idx != 0 || this->d.a.start_idx == 0)) {
+            this->convert_to_tree();
+        }
+        if (this->is_array) {
+            if (idx == this->d.a.num_values) {
+                // append after the last element
+                this->d.a.values[this->d.a.start_idx + this->d.a.num_values] =
+                    value;
+            } else {
+                // prepend into the slot just before the first element
+                this->d.a.values[--this->d.a.start_idx] = value;
+            }
+            this->d.a.num_values++;
+        } else {
+            // insert_internal may report a subtree to rebalance.
+            subtree *rebalance_subtree = nullptr;
+            this->insert_internal(
+                &this->d.t.root, value, idx, &rebalance_subtree);
+            if (rebalance_subtree != nullptr) {
+                this->rebalance(rebalance_subtree);
+            }
+        }
+        return 0;
+    }
+
+    // Overwrite the element at position idx with value.
+    // Returns EINVAL if idx is not a valid position, 0 otherwise.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::set_at(
+        const omtdata_t &value,
+        const uint32_t idx) {
+        barf_if_marked(*this);
+        if (idx >= this->size()) {
+            return EINVAL;
+        }
+
+        if (!this->is_array) {
+            this->set_at_internal(this->d.t.root, value, idx);
+            return 0;
+        }
+        this->set_at_internal_array(value, idx);
+        return 0;
+    }
+
+    // Remove the element at position idx.
+    // Returns EINVAL if idx is not a valid position, 0 otherwise.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::delete_at(
+        const uint32_t idx) {
+        barf_if_marked(*this);
+        if (idx >= this->size()) {
+            return EINVAL;
+        }
+
+        this->maybe_resize_or_convert(this->size() - 1);
+        // The array form is kept only when the first or the last element is
+        // removed; removing from the middle goes through the tree form.
+        if (this->is_array && idx != 0 && idx != this->d.a.num_values - 1) {
+            this->convert_to_tree();
+        }
+        if (this->is_array) {
+            // Testing for 0 does not rule out it being the last entry.
+            // Test explicitly for num_values-1
+            if (idx != this->d.a.num_values - 1) {
+                // removing the first element: advance the start of the array
+                this->d.a.start_idx++;
+            }
+            this->d.a.num_values--;
+        } else {
+            // delete_internal may report a subtree to rebalance.
+            subtree *rebalance_subtree = nullptr;
+            this->delete_internal(
+                &this->d.t.root, idx, nullptr, &rebalance_subtree);
+            if (rebalance_subtree != nullptr) {
+                this->rebalance(rebalance_subtree);
+            }
+        }
+        return 0;
+    }
+
+    // Iterate over every element by delegating to iterate_on_range with the
+    // range [0, size()); returns whatever iterate_on_range returns.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <
+        typename iterate_extra_t,
+        int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::iterate(
+        iterate_extra_t *const iterate_extra) const {
+        return this->iterate_on_range<iterate_extra_t, f>(
+            0, this->size(), iterate_extra);
+    }
+
+    // Iterate over the positions from left up to (but not including) right,
+    // using the array or tree iteration helper depending on the current
+    // form.
+    // Returns EINVAL if right is larger than the size, 0 for an empty range
+    // (left == right), and otherwise the result of the iteration helper.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <
+        typename iterate_extra_t,
+        int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_on_range(
+        const uint32_t left,
+        const uint32_t right,
+        iterate_extra_t *const iterate_extra) const {
+        if (right > this->size()) {
+            return EINVAL;
+        }
+        int result = 0;
+        if (left != right) {
+            if (this->is_array) {
+                result = this->iterate_internal_array<iterate_extra_t, f>(
+                    left, right, iterate_extra);
+            } else {
+                result = this->iterate_internal<iterate_extra_t, f>(
+                    left, right, this->d.t.root, 0, iterate_extra);
+            }
+        }
+        return result;
+    }
+
+    // Marking variant of iterate_on_range, available only when
+    // supports_marks is true.  Delegates to iterate_and_mark_range_internal
+    // starting at the root; the omt is expected to be in tree form.
+    // Returns EINVAL if right is larger than the size, 0 for an empty range
+    // (left == right), and otherwise the result of the internal helper.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <
+        typename iterate_extra_t,
+        int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_and_mark_range(
+        const uint32_t left,
+        const uint32_t right,
+        iterate_extra_t *const iterate_extra) {
+        static_assert(supports_marks, "does not support marks");
+        if (right > this->size()) {
+            return EINVAL;
+        }
+        if (left == right) {
+            return 0;
+        }
+        paranoid_invariant(!this->is_array);
+        return this->iterate_and_mark_range_internal<iterate_extra_t, f>(
+            left, right, this->d.t.root, 0, iterate_extra);
+    }
+
+    // TODO: We can optimize this if we steal 3 bits.  1 bit: this node is
+    // marked.  1 bit: left subtree has marks. 1 bit: right subtree has marks.
+    //
+    // Iterate over the marked elements by delegating to
+    // iterate_over_marked_internal starting at the root.  Available only
+    // when supports_marks is true; the omt is expected to be in tree form.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <
+        typename iterate_extra_t,
+        int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_over_marked(
+        iterate_extra_t *const iterate_extra) const {
+        static_assert(supports_marks, "does not support marks");
+        paranoid_invariant(!this->is_array);
+        return this->iterate_over_marked_internal<iterate_extra_t, f>(
+            this->d.t.root, 0, iterate_extra);
+    }
+
+    // Clears the stolen (mark) bits in the subtree rooted at st and appends
+    // the in-order index of every node that was marked to *indexes.  `index`
+    // is the in-order index of the leftmost element of this subtree.
+    // Children are only visited when the node's "marks below" bit was set.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::unmark(
+        const subtree &st,
+        const uint32_t index,
+        GrowableArray<node_idx> *const indexes) {
+        if (!st.is_null()) {
+            omt_node &node = this->d.t.nodes[st.get_index()];
+            const uint32_t node_pos = index + this->nweight(node.left);
+
+            // Sample the bit up front: it is wiped below, before the right
+            // child is visited.
+            const bool descend = node.get_marks_below();
+            if (descend) {
+                this->unmark(node.left, index, indexes);
+            }
+            if (node.get_marked()) {
+                indexes->push(node_pos);
+            }
+            node.clear_stolen_bits();
+            if (descend) {
+                this->unmark(node.right, node_pos + 1, indexes);
+            }
+        }
+    }
+
+    // Deletes every marked element.  Does nothing when has_marks() reports
+    // no marks; otherwise requires the tree representation.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::delete_all_marked(void) {
+        static_assert(supports_marks, "does not support marks");
+        if (!this->has_marks()) {
+            return;
+        }
+        paranoid_invariant(!this->is_array);
+        GrowableArray<node_idx> marked_indexes;
+        marked_indexes.init();
+
+        // First pass: collect the indexes of the marked elements and strip
+        // all stolen bits, so that delete_at does not barf on a marked tree.
+        this->unmark(this->d.t.root, 0, &marked_indexes);
+
+        // Second pass: delete from left to right.  Each earlier deletion
+        // shifts the remaining elements down by one, hence the subtraction.
+        // (Deleting from right to left would be the alternative.)
+        uint32_t num_deleted = 0;
+        while (num_deleted < marked_indexes.get_size()) {
+            int r = this->delete_at(
+                marked_indexes.fetch_unchecked(num_deleted) - num_deleted);
+            lazy_assert_zero(r);
+            num_deleted++;
+        }
+        marked_indexes.deinit();
+        barf_if_marked(*this);
+    }
+
+    // Recursively checks the mark bits of the subtree rooted at st and
+    // returns the number of marked nodes it contains (including its root).
+    //
+    // allow_marks is the "marks below" bit of the parent: a node may only be
+    // marked, or have marks below it, when its parent advertises marks
+    // below.  The checks themselves are paranoid_invariant calls; the
+    // parameter is wrapped in UU(), presumably so it does not trigger an
+    // unused-parameter warning when those checks compile to nothing.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::
+        verify_marks_consistent_internal(const subtree &st,
+                                         const bool UU(allow_marks)) const {
+        if (st.is_null()) {
+            return 0;
+        }
+        const omt_node &node = this->d.t.nodes[st.get_index()];
+        // Count the marks in both children first.
+        uint32_t num_marks =
+            verify_marks_consistent_internal(node.left, node.get_marks_below());
+        num_marks += verify_marks_consistent_internal(node.right,
+                                                      node.get_marks_below());
+        if (node.get_marks_below()) {
+            // The bit must be permitted by the parent and must be truthful.
+            paranoid_invariant(allow_marks);
+            paranoid_invariant(num_marks > 0);
+        } else {
+            // redundant with invariant below, but nice to have explicitly
+            paranoid_invariant(num_marks == 0);
+        }
+        if (node.get_marked()) {
+            paranoid_invariant(allow_marks);
+            ++num_marks;
+        }
+        return num_marks;
+    }
+
+    // Checks the mark bits of the whole tree, starting at the root with
+    // marks allowed.  Only valid on the tree representation; the count
+    // returned by the internal helper is discarded.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent(
+        void) const {
+        static_assert(supports_marks, "does not support marks");
+        paranoid_invariant(!this->is_array);
+        this->verify_marks_consistent_internal(this->d.t.root, true);
+    }
+
+    // Applies f to a mutable pointer to every stored element, covering the
+    // full index range [0, size()) in whichever representation is active.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename iterate_extra_t,
+              int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr(
+        iterate_extra_t *const iterate_extra) {
+        if (!this->is_array) {
+            this->iterate_ptr_internal<iterate_extra_t, f>(
+                0, this->size(), this->d.t.root, 0, iterate_extra);
+            return;
+        }
+        this->iterate_ptr_internal_array<iterate_extra_t, f>(
+            0, this->size(), iterate_extra);
+    }
+
+    // Copies out the element at position idx through *value (the internal
+    // helpers skip the copy when value is nullptr).  Returns EINVAL when idx
+    // is out of range, 0 otherwise.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::fetch(
+        const uint32_t idx,
+        omtdataout_t *const value) const {
+        if (idx >= this->size()) {
+            return EINVAL;
+        }
+        if (!this->is_array) {
+            this->fetch_internal(this->d.t.root, idx, value);
+        } else {
+            this->fetch_internal_array(idx, value);
+        }
+        return 0;
+    }
+
+    // Searches for an element on which h returns 0, dispatching to the
+    // array or tree helper.  idxp may be nullptr; a local scratch slot is
+    // substituted so the helpers always receive a valid pointer.  Returns
+    // the helper's result unchanged.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::find_zero(
+        const omtcmp_t &extra,
+        omtdataout_t *const value,
+        uint32_t *const idxp) const {
+        uint32_t scratch_idx;
+        uint32_t *const out_idxp = (idxp == nullptr) ? &scratch_idx : idxp;
+        if (this->is_array) {
+            return this->find_internal_zero_array<omtcmp_t, h>(
+                extra, value, out_idxp);
+        }
+        return this->find_internal_zero<omtcmp_t, h>(
+            this->d.t.root, extra, value, out_idxp);
+    }
+
+    // Directional search: a negative direction dispatches to the "minus"
+    // helpers and a positive one to the "plus" helpers, for whichever
+    // representation is active.  direction must be nonzero.  idxp may be
+    // nullptr; a local scratch slot is substituted in that case.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::find(
+        const omtcmp_t &extra,
+        int direction,
+        omtdataout_t *const value,
+        uint32_t *const idxp) const {
+        uint32_t scratch_idx;
+        uint32_t *const out_idxp = (idxp == nullptr) ? &scratch_idx : idxp;
+        paranoid_invariant(direction != 0);
+        if (this->is_array) {
+            if (direction < 0) {
+                return this->find_internal_minus_array<omtcmp_t, h>(
+                    extra, value, out_idxp);
+            }
+            return this->find_internal_plus_array<omtcmp_t, h>(
+                extra, value, out_idxp);
+        }
+        if (direction < 0) {
+            return this->find_internal_minus<omtcmp_t, h>(
+                this->d.t.root, extra, value, out_idxp);
+        }
+        return this->find_internal_plus<omtcmp_t, h>(
+            this->d.t.root, extra, value, out_idxp);
+    }
+
+    // Returns sizeof(*this) plus capacity times the size of one slot of the
+    // active representation (a value for arrays, a node for trees).
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    size_t omt<omtdata_t, omtdataout_t, supports_marks>::memory_size(void) {
+        const size_t slot_size = this->is_array
+                                     ? (sizeof this->d.a.values[0])
+                                     : (sizeof this->d.t.nodes[0]);
+        return (sizeof *this) + this->capacity * slot_size;
+    }
+
+    // Puts the omt into the empty array representation without allocating
+    // storage: values is left as nullptr while capacity records
+    // new_capacity.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::create_internal_no_array(
+        const uint32_t new_capacity) {
+        this->capacity = new_capacity;
+        this->is_array = true;
+        this->d.a.values = nullptr;
+        this->d.a.num_values = 0;
+        this->d.a.start_idx = 0;
+    }
+
+    // Initializes an empty array-form omt and then allocates its backing
+    // store via XMALLOC_N (presumably room for `capacity` values — the macro
+    // is defined elsewhere).
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::create_internal(
+        const uint32_t new_capacity) {
+        this->create_internal_no_array(new_capacity);
+        XMALLOC_N(this->capacity, this->d.a.values);
+    }
+
+    // Returns the weight stored in the root node of st, or 0 for a null
+    // subtree.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::nweight(
+        const subtree &st) const {
+        if (!st.is_null()) {
+            return this->d.t.nodes[st.get_index()].weight;
+        }
+        return 0;
+    }
+
+    // Hands out the next unused slot of the node array: clears its stolen
+    // bits, advances free_idx and returns the slot's index.  The caller must
+    // have ensured there is room (free_idx < capacity).
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    typename omt<omtdata_t, omtdataout_t, supports_marks>::node_idx
+    omt<omtdata_t, omtdataout_t, supports_marks>::node_malloc(void) {
+        paranoid_invariant(this->d.t.free_idx < this->capacity);
+        const node_idx fresh_idx = this->d.t.free_idx;
+        this->d.t.nodes[fresh_idx].clear_stolen_bits();
+        this->d.t.free_idx++;
+        return fresh_idx;
+    }
+
+    // Releases a node slot.  No memory is reclaimed here: the body only
+    // checks that idx lies within capacity (node_malloc never reuses slots,
+    // it just advances free_idx).
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::node_free(
+        const node_idx UU(idx)) {
+        paranoid_invariant(idx < this->capacity);
+    }
+
+    // Makes sure the array representation is sized sensibly for n values.
+    //
+    // The target size is 2 * n (at least 4).  A new buffer is allocated when
+    // the room after start_idx is smaller than n, or when the current
+    // capacity is at least twice the target size.  The live values are moved
+    // to the front of the new buffer, so start_idx becomes 0.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::maybe_resize_array(
+        const uint32_t n) {
+        const uint32_t new_size = n <= 2 ? 4 : 2 * n;
+        const uint32_t room = this->capacity - this->d.a.start_idx;
+
+        if (room < n || this->capacity / 2 >= new_size) {
+            omtdata_t *XMALLOC_N(new_size, tmp_values);
+            // memcpy requires valid pointers even for a zero-byte copy, and
+            // values can be nullptr here (create_internal_no_array leaves it
+            // unset), so only copy when there is something to move.
+            if (this->d.a.num_values > 0) {
+                memcpy(tmp_values,
+                       &this->d.a.values[this->d.a.start_idx],
+                       this->d.a.num_values * (sizeof tmp_values[0]));
+            }
+            this->d.a.start_idx = 0;
+            this->capacity = new_size;
+            toku_free(this->d.a.values);
+            this->d.a.values = tmp_values;
+        }
+    }
+
+    // Writes the values of the subtree rooted at st into array in in-order
+    // sequence: the left subtree first, then the root, then the right
+    // subtree.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::
+        fill_array_with_subtree_values(omtdata_t *const array,
+                                       const subtree &st) const {
+        if (!st.is_null()) {
+            const omt_node &node = this->d.t.nodes[st.get_index()];
+            const uint32_t left_weight = this->nweight(node.left);
+            this->fill_array_with_subtree_values(array, node.left);
+            array[left_weight] = node.value;
+            this->fill_array_with_subtree_values(array + left_weight + 1,
+                                                 node.right);
+        }
+    }
+
+    // Switches from the tree to the array representation (no-op when
+    // already an array).  The new array has capacity 2 * size(), at least 4,
+    // and holds the values in in-order sequence starting at index 0.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::convert_to_array(void) {
+        if (this->is_array) {
+            return;
+        }
+        const uint32_t num_values = this->size();
+        const uint32_t doubled = 2 * num_values;
+        const uint32_t new_size = doubled < 4 ? 4 : doubled;
+
+        omtdata_t *XMALLOC_N(new_size, tmp_values);
+        // Flatten the tree before its node storage is released.
+        this->fill_array_with_subtree_values(tmp_values, this->d.t.root);
+        toku_free(this->d.t.nodes);
+        this->is_array = true;
+        this->capacity = new_size;
+        this->d.a.num_values = num_values;
+        this->d.a.values = tmp_values;
+        this->d.a.start_idx = 0;
+    }
+
+    // Builds a subtree from values[0 .. numvalues) and stores it in *st.
+    // The middle element becomes the root and the two halves are built
+    // recursively; an empty range yields a null subtree.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void
+    omt<omtdata_t, omtdataout_t, supports_marks>::rebuild_from_sorted_array(
+        subtree *const st,
+        const omtdata_t *const values,
+        const uint32_t numvalues) {
+        if (numvalues == 0) {
+            st->set_to_null();
+            return;
+        }
+        const uint32_t mid = numvalues / 2;
+        const node_idx fresh_idx = this->node_malloc();
+        omt_node *const node = &this->d.t.nodes[fresh_idx];
+        node->weight = numvalues;
+        node->value = values[mid];
+        st->set_index(fresh_idx);
+        // Everything about this node is settled before recursing, so the
+        // second recursive call can be a tail call.
+        this->rebuild_from_sorted_array(&node->left, values, mid);
+        this->rebuild_from_sorted_array(
+            &node->right, values + mid + 1, numvalues - (mid + 1));
+    }
+
+    // Switches from the array to the tree representation (no-op when
+    // already a tree).  Allocates 2 * size() node slots (at least 4),
+    // rebuilds a tree from the array's values and then frees the array.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::convert_to_tree(void) {
+        if (!this->is_array) {
+            return;
+        }
+        const uint32_t num_nodes = this->size();
+        const uint32_t doubled = num_nodes * 2;
+        const uint32_t new_size = doubled < 4 ? 4 : doubled;
+
+        omt_node *XMALLOC_N(new_size, new_nodes);
+        // Capture the array fields before the tree fields are written.
+        omtdata_t *const old_storage = this->d.a.values;
+        omtdata_t *const live_values = &old_storage[this->d.a.start_idx];
+        this->is_array = false;
+        this->d.t.nodes = new_nodes;
+        this->capacity = new_size;
+        this->d.t.free_idx = 0;
+        this->d.t.root.set_to_null();
+        this->rebuild_from_sorted_array(
+            &this->d.t.root, live_values, num_nodes);
+        toku_free(old_storage);
+    }
+
+    // Prepares the omt to hold n values.  Arrays are delegated to
+    // maybe_resize_array.  A tree is rebuilt when it is oversized for the
+    // target size (2 * n, at least 4), when its slots are exhausted while it
+    // holds fewer than n nodes, or when its capacity is below n.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::maybe_resize_or_convert(
+        const uint32_t n) {
+        if (this->is_array) {
+            this->maybe_resize_array(n);
+            return;
+        }
+        const uint32_t new_size = n <= 2 ? 4 : 2 * n;
+        const uint32_t num_nodes = this->nweight(this->d.t.root);
+        const bool oversized = this->capacity / 2 >= new_size;
+        const bool slots_exhausted =
+            this->d.t.free_idx >= this->capacity && num_nodes < n;
+        const bool undersized = this->capacity < n;
+        if (oversized || slots_exhausted || undersized) {
+            this->convert_to_array();
+            // With a free list the "supports_marks" variant could simply
+            // resize; as things stand it has to go to an array and back.
+            if (supports_marks) {
+                this->convert_to_tree();
+            }
+        }
+    }
+
+    // Reports whether the node at the root of st would be out of balance
+    // once the weight of its left child changes by leftmod and that of its
+    // right child by rightmod (callers in this file pass +1 for an insert
+    // and -1 for a delete).  A null subtree never needs rebalancing.
+    //
+    // The node is considered unbalanced when one side (plus the root) is
+    // smaller than half of the other side (plus the root), rounded up.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    bool omt<omtdata_t, omtdataout_t, supports_marks>::will_need_rebalance(
+        const subtree &st,
+        const int leftmod,
+        const int rightmod) const {
+        if (st.is_null()) {
+            return false;
+        }
+        const omt_node &n = this->d.t.nodes[st.get_index()];
+        // one of the 1's is for the root.
+        // the other is to take ceil(n/2)
+        const uint32_t weight_left = this->nweight(n.left) + leftmod;
+        const uint32_t weight_right = this->nweight(n.right) + rightmod;
+        return ((1 + weight_left < (1 + 1 + weight_right) / 2) ||
+                (1 + weight_right < (1 + 1 + weight_left) / 2));
+    }
+
+    // Inserts value at in-order position idx of the subtree *subtreep.
+    //
+    // Every node on the path has its weight incremented.  The first node
+    // encountered on the way down that will_need_rebalance flags is recorded
+    // in *rebalance_subtree (only if nothing was recorded yet); no
+    // rebalancing is done here.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::insert_internal(
+        subtree *const subtreep,
+        const omtdata_t &value,
+        const uint32_t idx,
+        subtree **const rebalance_subtree) {
+        if (subtreep->is_null()) {
+            // Reached an empty slot: create a leaf here.
+            paranoid_invariant_zero(idx);
+            const node_idx newidx = this->node_malloc();
+            omt_node *const newnode = &this->d.t.nodes[newidx];
+            newnode->weight = 1;
+            newnode->left.set_to_null();
+            newnode->right.set_to_null();
+            newnode->value = value;
+            subtreep->set_index(newidx);
+        } else {
+            omt_node &n = this->d.t.nodes[subtreep->get_index()];
+            n.weight++;
+            if (idx <= this->nweight(n.left)) {
+                // The new element lands in the left subtree.
+                if (*rebalance_subtree == nullptr &&
+                    this->will_need_rebalance(*subtreep, 1, 0)) {
+                    *rebalance_subtree = subtreep;
+                }
+                this->insert_internal(&n.left, value, idx, rebalance_subtree);
+            } else {
+                // The new element lands in the right subtree.
+                if (*rebalance_subtree == nullptr &&
+                    this->will_need_rebalance(*subtreep, 0, 1)) {
+                    *rebalance_subtree = subtreep;
+                }
+                // Translate idx so it is relative to the right subtree.
+                const uint32_t sub_index = idx - this->nweight(n.left) - 1;
+                this->insert_internal(
+                    &n.right, value, sub_index, rebalance_subtree);
+            }
+        }
+    }
+
+    // Overwrites the element at logical position idx of the array
+    // representation (physical slot start_idx + idx).  No bounds check is
+    // performed here.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal_array(
+        const omtdata_t &value,
+        const uint32_t idx) {
+        this->d.a.values[this->d.a.start_idx + idx] = value;
+    }
+
+    // Overwrites the element at in-order position idx of the (non-null)
+    // subtree st, descending left or right according to the weight of the
+    // left child.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal(
+        const subtree &st,
+        const omtdata_t &value,
+        const uint32_t idx) {
+        paranoid_invariant(!st.is_null());
+        omt_node &node = this->d.t.nodes[st.get_index()];
+        const uint32_t left_weight = this->nweight(node.left);
+        if (idx == left_weight) {
+            node.value = value;
+            return;
+        }
+        if (idx < left_weight) {
+            this->set_at_internal(node.left, value, idx);
+            return;
+        }
+        this->set_at_internal(node.right, value, idx - left_weight - 1);
+    }
+
+    // Removes the element at in-order position idx from the subtree
+    // *subtreep.
+    //
+    // Weights along the path are decremented.  The first node on the way
+    // down that will_need_rebalance flags is recorded in *rebalance_subtree
+    // (only if nothing was recorded yet); no rebalancing is done here.
+    //
+    // copyn, when non-null, receives the value of the node that is
+    // physically unlinked.  It is used to delete a node with two children:
+    // that node stays in place and takes over the value of the leftmost
+    // node of its right subtree, which is unlinked instead.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::delete_internal(
+        subtree *const subtreep,
+        const uint32_t idx,
+        omt_node *const copyn,
+        subtree **const rebalance_subtree) {
+        paranoid_invariant_notnull(subtreep);
+        paranoid_invariant_notnull(rebalance_subtree);
+        paranoid_invariant(!subtreep->is_null());
+        omt_node &n = this->d.t.nodes[subtreep->get_index()];
+        const uint32_t leftweight = this->nweight(n.left);
+        if (idx < leftweight) {
+            // The victim is in the left subtree.
+            n.weight--;
+            if (*rebalance_subtree == nullptr &&
+                this->will_need_rebalance(*subtreep, -1, 0)) {
+                *rebalance_subtree = subtreep;
+            }
+            this->delete_internal(&n.left, idx, copyn, rebalance_subtree);
+        } else if (idx == leftweight) {
+            // This node is the victim.
+            if (n.left.is_null()) {
+                // At most one child: splice the right child into our place.
+                const uint32_t oldidx = subtreep->get_index();
+                *subtreep = n.right;
+                if (copyn != nullptr) {
+                    copyn->value = n.value;
+                }
+                this->node_free(oldidx);
+            } else if (n.right.is_null()) {
+                // Only a left child: splice it into our place.
+                const uint32_t oldidx = subtreep->get_index();
+                *subtreep = n.left;
+                if (copyn != nullptr) {
+                    copyn->value = n.value;
+                }
+                this->node_free(oldidx);
+            } else {
+                // Two children: keep this node, overwrite its value with the
+                // leftmost value of the right subtree and unlink that node.
+                if (*rebalance_subtree == nullptr &&
+                    this->will_need_rebalance(*subtreep, 0, -1)) {
+                    *rebalance_subtree = subtreep;
+                }
+                // don't need to copy up value, it's only used by this
+                // next call, and when that gets to the bottom there
+                // won't be any more recursion
+                n.weight--;
+                this->delete_internal(&n.right, 0, &n, rebalance_subtree);
+            }
+        } else {
+            // The victim is in the right subtree.
+            n.weight--;
+            if (*rebalance_subtree == nullptr &&
+                this->will_need_rebalance(*subtreep, 0, -1)) {
+                *rebalance_subtree = subtreep;
+            }
+            this->delete_internal(
+                &n.right, idx - leftweight - 1, copyn, rebalance_subtree);
+        }
+    }
+
+    // Array-form traversal: calls f on the elements at logical indexes
+    // [left, right) in ascending order and stops at the first nonzero
+    // result, which is returned.  Returns 0 when every call returned 0.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <
+        typename iterate_extra_t,
+        int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_internal_array(
+        const uint32_t left,
+        const uint32_t right,
+        iterate_extra_t *const iterate_extra) const {
+        uint32_t pos = left;
+        while (pos < right) {
+            const int status = f(
+                this->d.a.values[this->d.a.start_idx + pos], pos, iterate_extra);
+            if (status != 0) {
+                return status;
+            }
+            ++pos;
+        }
+        return 0;
+    }
+
+    // Tree-form traversal with mutable access: calls f on a pointer to each
+    // value whose in-order index lies in [left, right).  idx is the in-order
+    // index of the leftmost element of st.  f must return 0
+    // (lazy_assert_zero).
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename iterate_extra_t,
+              int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr_internal(
+        const uint32_t left,
+        const uint32_t right,
+        const subtree &st,
+        const uint32_t idx,
+        iterate_extra_t *const iterate_extra) {
+        if (st.is_null()) {
+            return;
+        }
+        omt_node &node = this->d.t.nodes[st.get_index()];
+        const uint32_t node_pos = idx + this->nweight(node.left);
+        // Part of the range lies to the left of this node.
+        if (left < node_pos) {
+            this->iterate_ptr_internal<iterate_extra_t, f>(
+                left, right, node.left, idx, iterate_extra);
+        }
+        // This node itself is inside the range.
+        if (left <= node_pos && node_pos < right) {
+            int r = f(&node.value, node_pos, iterate_extra);
+            lazy_assert_zero(r);
+        }
+        // Part of the range lies to the right of this node.
+        if (node_pos + 1 < right) {
+            this->iterate_ptr_internal<iterate_extra_t, f>(
+                left, right, node.right, node_pos + 1, iterate_extra);
+        }
+    }
+
+    // Array-form traversal with mutable access: calls f on a pointer to each
+    // element at logical indexes [left, right).  f must return 0
+    // (lazy_assert_zero).
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename iterate_extra_t,
+              int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+    void
+    omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr_internal_array(
+        const uint32_t left,
+        const uint32_t right,
+        iterate_extra_t *const iterate_extra) {
+        uint32_t pos = left;
+        while (pos < right) {
+            int r = f(
+                &this->d.a.values[this->d.a.start_idx + pos], pos, iterate_extra);
+            lazy_assert_zero(r);
+            ++pos;
+        }
+    }
+
+    // Tree-form traversal: calls f, in ascending index order, on every value
+    // of the subtree st whose in-order index lies in [left, right).  idx is
+    // the in-order index of the leftmost element of st.  Stops at, and
+    // returns, the first nonzero result of f; returns 0 otherwise.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <
+        typename iterate_extra_t,
+        int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_internal(
+        const uint32_t left,
+        const uint32_t right,
+        const subtree &st,
+        const uint32_t idx,
+        iterate_extra_t *const iterate_extra) const {
+        if (st.is_null()) {
+            return 0;
+        }
+        int r;
+        const omt_node &n = this->d.t.nodes[st.get_index()];
+        // In-order index of this node.
+        const uint32_t idx_root = idx + this->nweight(n.left);
+        if (left < idx_root) {
+            // Part of the range lies in the left subtree.
+            r = this->iterate_internal<iterate_extra_t, f>(
+                left, right, n.left, idx, iterate_extra);
+            if (r != 0) {
+                return r;
+            }
+        }
+        if (left <= idx_root && idx_root < right) {
+            r = f(n.value, idx_root, iterate_extra);
+            if (r != 0) {
+                return r;
+            }
+        }
+        if (idx_root + 1 < right) {
+            // Part of the range lies in the right subtree.
+            return this->iterate_internal<iterate_extra_t, f>(
+                left, right, n.right, idx_root + 1, iterate_extra);
+        }
+        return 0;
+    }
+
+    // Like iterate_internal, but additionally marks every node whose index
+    // lies in [left, right) and sets the "marks below" bit on each node it
+    // descends through.  st must not be null.  idx is the in-order index of
+    // the leftmost element of st.  Stops at, and returns, the first nonzero
+    // result of f; returns 0 otherwise.
+    //
+    // Note the ordering: bits are set before the corresponding recursive
+    // call or call to f, so they stay set even when f returns nonzero.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <
+        typename iterate_extra_t,
+        int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::
+        iterate_and_mark_range_internal(const uint32_t left,
+                                        const uint32_t right,
+                                        const subtree &st,
+                                        const uint32_t idx,
+                                        iterate_extra_t *const iterate_extra) {
+        paranoid_invariant(!st.is_null());
+        int r;
+        omt_node &n = this->d.t.nodes[st.get_index()];
+        // In-order index of this node.
+        const uint32_t idx_root = idx + this->nweight(n.left);
+        if (left < idx_root && !n.left.is_null()) {
+            // Something in the left subtree is about to be marked.
+            n.set_marks_below_bit();
+            r = this->iterate_and_mark_range_internal<iterate_extra_t, f>(
+                left, right, n.left, idx, iterate_extra);
+            if (r != 0) {
+                return r;
+            }
+        }
+        if (left <= idx_root && idx_root < right) {
+            n.set_marked_bit();
+            r = f(n.value, idx_root, iterate_extra);
+            if (r != 0) {
+                return r;
+            }
+        }
+        if (idx_root + 1 < right && !n.right.is_null()) {
+            // Something in the right subtree is about to be marked.
+            n.set_marks_below_bit();
+            return this->iterate_and_mark_range_internal<iterate_extra_t, f>(
+                left, right, n.right, idx_root + 1, iterate_extra);
+        }
+        return 0;
+    }
+
+    // Calls f, in ascending index order, on every marked value of the
+    // subtree st.  Children are visited only when the node's "marks below"
+    // bit is set.  idx is the in-order index of the leftmost element of st.
+    // Stops at, and returns, the first nonzero result of f; returns 0
+    // otherwise.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <
+        typename iterate_extra_t,
+        int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int
+    omt<omtdata_t, omtdataout_t, supports_marks>::iterate_over_marked_internal(
+        const subtree &st,
+        const uint32_t idx,
+        iterate_extra_t *const iterate_extra) const {
+        if (st.is_null()) {
+            return 0;
+        }
+        int r;
+        const omt_node &n = this->d.t.nodes[st.get_index()];
+        // In-order index of this node.
+        const uint32_t idx_root = idx + this->nweight(n.left);
+        if (n.get_marks_below()) {
+            r = this->iterate_over_marked_internal<iterate_extra_t, f>(
+                n.left, idx, iterate_extra);
+            if (r != 0) {
+                return r;
+            }
+        }
+        if (n.get_marked()) {
+            r = f(n.value, idx_root, iterate_extra);
+            if (r != 0) {
+                return r;
+            }
+        }
+        if (n.get_marks_below()) {
+            return this->iterate_over_marked_internal<iterate_extra_t, f>(
+                n.right, idx_root + 1, iterate_extra);
+        }
+        return 0;
+    }
+
+    // Array-form fetch: copies out the element at logical index i through
+    // *value.  Does nothing when value is nullptr.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::fetch_internal_array(
+        const uint32_t i,
+        omtdataout_t *const value) const {
+        if (value == nullptr) {
+            return;
+        }
+        copyout(value, &this->d.a.values[this->d.a.start_idx + i]);
+    }
+
+    // Tree-form fetch: locates the node at in-order position i of the
+    // subtree st and copies it out through *value (skipped when value is
+    // nullptr).
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::fetch_internal(
+        const subtree &st,
+        const uint32_t i,
+        omtdataout_t *const value) const {
+        omt_node &node = this->d.t.nodes[st.get_index()];
+        const uint32_t left_weight = this->nweight(node.left);
+        if (i == left_weight) {
+            if (value != nullptr) {
+                copyout(value, &node);
+            }
+            return;
+        }
+        if (i < left_weight) {
+            this->fetch_internal(node.left, i, value);
+            return;
+        }
+        this->fetch_internal(node.right, i - left_weight - 1, value);
+    }
+
+    // Writes the node indexes of the subtree rooted at st into array in
+    // in-order sequence: the left subtree first, then the root, then the
+    // right subtree.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void
+    omt<omtdata_t, omtdataout_t, supports_marks>::fill_array_with_subtree_idxs(
+        node_idx *const array,
+        const subtree &st) const {
+        if (st.is_null()) {
+            return;
+        }
+        const omt_node &node = this->d.t.nodes[st.get_index()];
+        const uint32_t left_weight = this->nweight(node.left);
+        this->fill_array_with_subtree_idxs(array, node.left);
+        array[left_weight] = st.get_index();
+        this->fill_array_with_subtree_idxs(array + left_weight + 1,
+                                           node.right);
+    }
+
+    // Relinks the existing nodes listed in idxs[0 .. numvalues) into a
+    // subtree stored in *st.  The middle index becomes the root and the two
+    // halves are rebuilt recursively; only weights and child links are
+    // rewritten, the node values stay where they are.  An empty range
+    // yields a null subtree.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void
+    omt<omtdata_t, omtdataout_t, supports_marks>::rebuild_subtree_from_idxs(
+        subtree *const st,
+        const node_idx *const idxs,
+        const uint32_t numvalues) {
+        if (numvalues == 0) {
+            st->set_to_null();
+            return;
+        }
+        const uint32_t mid = numvalues / 2;
+        st->set_index(idxs[mid]);
+        omt_node &node = this->d.t.nodes[st->get_index()];
+        node.weight = numvalues;
+        this->rebuild_subtree_from_idxs(&node.left, idxs, mid);
+        this->rebuild_subtree_from_idxs(
+            &node.right, idxs + mid + 1, numvalues - (mid + 1));
+    }
+
+    // Rebalances the subtree *st.
+    //
+    // When st is the root of the whole tree, the omt is converted to an
+    // array (and straight back to a tree for the supports_marks variant).
+    // Otherwise the subtree's node indexes are collected in in-order
+    // sequence into a scratch array and the subtree is relinked from it.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::rebalance(
+        subtree *const st) {
+        node_idx idx = st->get_index();
+        if (idx == this->d.t.root.get_index()) {
+            // Try to convert to an array.
+            // If this fails, (malloc) nothing will have changed.
+            // In the failure case we continue on to the standard rebalance
+            // algorithm.
+            // NOTE(review): convert_to_array() returns void and this branch
+            // never falls through to the else branch, so no failure path is
+            // visible here — confirm whether the comment above is stale.
+            this->convert_to_array();
+            if (supports_marks) {
+                this->convert_to_tree();
+            }
+        } else {
+            const omt_node &n = this->d.t.nodes[idx];
+            node_idx *tmp_array;
+            // Bytes needed for one index per node in the subtree, versus
+            // bytes available in the unused tail of the node array.
+            size_t mem_needed = n.weight * (sizeof tmp_array[0]);
+            size_t mem_free = (this->capacity - this->d.t.free_idx) *
+                              (sizeof this->d.t.nodes[0]);
+            bool malloced;
+            if (mem_needed <= mem_free) {
+                // There is sufficient free space at the end of the nodes array
+                // to hold enough node indexes to rebalance.
+                malloced = false;
+                tmp_array = reinterpret_cast<node_idx *>(
+                    &this->d.t.nodes[this->d.t.free_idx]);
+            } else {
+                malloced = true;
+                XMALLOC_N(n.weight, tmp_array);
+            }
+            this->fill_array_with_subtree_idxs(tmp_array, *st);
+            this->rebuild_subtree_from_idxs(st, tmp_array, n.weight);
+            // Only release the scratch array if it was separately allocated.
+            if (malloced)
+                toku_free(tmp_array);
+        }
+    }
+
+    // copyout overloads: deliver a stored element to the caller, either by
+    // value or as a pointer into the omt's storage, from either a tree node
+    // or an array slot.
+
+    // Tree node, by value: copies the node's value into *out.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+        omtdata_t *const out,
+        const omt_node *const n) {
+        *out = n->value;
+    }
+
+    // Tree node, by pointer: stores the address of the node's value in *out.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+        omtdata_t **const out,
+        omt_node *const n) {
+        *out = &n->value;
+    }
+
+    // Array slot, by value: copies the pointed-to value into *out.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+        omtdata_t *const out,
+        const omtdata_t *const stored_value_ptr) {
+        *out = *stored_value_ptr;
+    }
+
+    // Array slot, by pointer: stores the slot's address in *out.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+        omtdata_t **const out,
+        omtdata_t *const stored_value_ptr) {
+        *out = stored_value_ptr;
+    }
+
+    // Array-form search for the leftmost element on which h returns 0.
+    //
+    // Binary-searches the physical slots [start_idx, start_idx + num_values),
+    // moving right when h is negative and left otherwise (this presumes h is
+    // non-decreasing over the stored order).
+    //
+    // On success, copies the element out through *value (if non-null), sets
+    // *idxp to its logical index and returns 0.  Otherwise returns
+    // DB_NOTFOUND with *idxp set to the logical index of the leftmost
+    // element found with h > 0, or to num_values when there is none.
+    // idxp must not be null.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero_array(
+        const omtcmp_t &extra,
+        omtdataout_t *const value,
+        uint32_t *const idxp) const {
+        paranoid_invariant_notnull(idxp);
+        uint32_t min = this->d.a.start_idx;
+        uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
+        uint32_t best_pos = subtree::NODE_NULL;
+        uint32_t best_zero = subtree::NODE_NULL;
+
+        while (min != limit) {
+            // Computed as an offset from min: (min + limit) / 2 could wrap
+            // around in uint32_t for very large arrays.
+            uint32_t mid = min + (limit - min) / 2;
+            int hv = h(this->d.a.values[mid], extra);
+            if (hv < 0) {
+                min = mid + 1;
+            } else if (hv > 0) {
+                best_pos = mid;
+                limit = mid;
+            } else {
+                // Keep looking to the left for an earlier zero.
+                best_zero = mid;
+                limit = mid;
+            }
+        }
+        if (best_zero != subtree::NODE_NULL) {
+            // Found a zero
+            if (value != nullptr) {
+                copyout(value, &this->d.a.values[best_zero]);
+            }
+            *idxp = best_zero - this->d.a.start_idx;
+            return 0;
+        }
+        if (best_pos != subtree::NODE_NULL)
+            *idxp = best_pos - this->d.a.start_idx;
+        else
+            *idxp = this->d.a.num_values;
+        return DB_NOTFOUND;
+    }
+
+    // Tree-representation counterpart of find_internal_zero_array: searches
+    // the subtree st for the leftmost node whose heaviside value
+    // h(node value, extra) is zero.
+    //
+    // *idxp is written on every path and is expressed relative to st (an
+    // empty subtree yields 0).  When a zero is found, the node's value (or a
+    // pointer to it, depending on omtdataout_t) is stored through value if
+    // value is non-null and 0 is returned; otherwise DB_NOTFOUND is returned.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero(
+        const subtree &st,
+        const omtcmp_t &extra,
+        omtdataout_t *const value,
+        uint32_t *const idxp) const {
+        paranoid_invariant_notnull(idxp);
+        if (st.is_null()) {
+            *idxp = 0;
+            return DB_NOTFOUND;
+        }
+
+        omt_node *const node = &this->d.t.nodes[st.get_index()];
+        const int sign = h(node->value, extra);
+
+        if (sign < 0) {
+            // Everything at or left of this node is negative: continue on the
+            // right and shift the reported index past the left subtree and
+            // this node, whether or not a zero was found.
+            const int rc = this->find_internal_zero<omtcmp_t, h>(
+                node->right, extra, value, idxp);
+            *idxp += this->nweight(node->left) + 1;
+            return rc;
+        }
+
+        // sign >= 0: any earlier zero can only live in the left subtree.
+        const int rc = this->find_internal_zero<omtcmp_t, h>(
+            node->left, extra, value, idxp);
+        if (sign > 0 || rc != DB_NOTFOUND) {
+            return rc;
+        }
+
+        // This node is a zero and nothing to its left is: it is the answer.
+        *idxp = this->nweight(node->left);
+        if (value != nullptr) {
+            copyout(value, node);
+        }
+        return 0;
+    }
+
+    // Binary search over the array representation (d.a) for the leftmost
+    // element whose heaviside value h(element, extra) is strictly positive.
+    //
+    // Parameters:
+    //   extra - opaque comparison argument forwarded to h.
+    //   value - if non-null, receives the element (or a pointer to it,
+    //           depending on omtdataout_t) on success.
+    //   idxp  - must be non-null; on success receives the element's index
+    //           relative to d.a.start_idx.
+    //
+    // Returns 0 on success.  Returns DB_NOTFOUND when no visited element has
+    // h > 0, in which case neither *value nor *idxp is written.
+    //
+    // The search relies on the signum of h being monotonically increasing
+    // over the stored values.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus_array(
+        const omtcmp_t &extra,
+        omtdataout_t *const value,
+        uint32_t *const idxp) const {
+        paranoid_invariant_notnull(idxp);
+        uint32_t min = this->d.a.start_idx;
+        uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
+        // subtree::NODE_NULL doubles as the "nothing recorded yet" sentinel.
+        uint32_t best = subtree::NODE_NULL;
+
+        while (min != limit) {
+            // Compute the midpoint as an offset from min.  The naive
+            // (min + limit) / 2 wraps around uint32_t once both bounds lie in
+            // the upper half of the index range.
+            const uint32_t mid = min + (limit - min) / 2;
+            const int hv = h(this->d.a.values[mid], extra);
+            if (hv > 0) {
+                // Candidate; an earlier positive element may still exist.
+                best = mid;
+                limit = mid;
+            } else {
+                min = mid + 1;
+            }
+        }
+        if (best == subtree::NODE_NULL) {
+            return DB_NOTFOUND;
+        }
+        if (value != nullptr) {
+            copyout(value, &this->d.a.values[best]);
+        }
+        *idxp = best - this->d.a.start_idx;
+        return 0;
+    }
+
+    // Tree-representation counterpart of find_internal_plus_array: searches
+    // the subtree st for the leftmost node whose heaviside value
+    // h(node value, extra) is strictly positive.
+    //
+    // On success returns 0, stores the node's index relative to st in *idxp
+    // and, if value is non-null, stores the node's value (or a pointer to it,
+    // depending on omtdataout_t) through value.  Returns DB_NOTFOUND without
+    // writing *idxp or *value when the subtree holds no such node.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus(
+        const subtree &st,
+        const omtcmp_t &extra,
+        omtdataout_t *const value,
+        uint32_t *const idxp) const {
+        paranoid_invariant_notnull(idxp);
+        if (st.is_null()) {
+            return DB_NOTFOUND;
+        }
+
+        omt_node &node = this->d.t.nodes[st.get_index()];
+
+        if (h(node.value, extra) <= 0) {
+            // This node is not positive, so the answer (if any) is to the
+            // right; shift a successful index past the left subtree and this
+            // node.
+            const int rc = this->find_internal_plus<omtcmp_t, h>(
+                node.right, extra, value, idxp);
+            if (rc == 0) {
+                *idxp += this->nweight(node.left) + 1;
+            }
+            return rc;
+        }
+
+        // This node is positive; prefer an earlier positive node on the left.
+        const int rc = this->find_internal_plus<omtcmp_t, h>(
+            node.left, extra, value, idxp);
+        if (rc != DB_NOTFOUND) {
+            return rc;
+        }
+
+        // Nothing positive on the left: this node is the leftmost one.
+        *idxp = this->nweight(node.left);
+        if (value != nullptr) {
+            copyout(value, &node);
+        }
+        return 0;
+    }
+
+    // Binary search over the array representation (d.a) for the rightmost
+    // element whose heaviside value h(element, extra) is strictly negative.
+    //
+    // Parameters:
+    //   extra - opaque comparison argument forwarded to h.
+    //   value - if non-null, receives the element (or a pointer to it,
+    //           depending on omtdataout_t) on success.
+    //   idxp  - must be non-null; on success receives the element's index
+    //           relative to d.a.start_idx.
+    //
+    // Returns 0 on success.  Returns DB_NOTFOUND when no visited element has
+    // h < 0, in which case neither *value nor *idxp is written.
+    //
+    // The search relies on the signum of h being monotonically increasing
+    // over the stored values.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus_array(
+        const omtcmp_t &extra,
+        omtdataout_t *const value,
+        uint32_t *const idxp) const {
+        paranoid_invariant_notnull(idxp);
+        uint32_t min = this->d.a.start_idx;
+        uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
+        // subtree::NODE_NULL doubles as the "nothing recorded yet" sentinel.
+        uint32_t best = subtree::NODE_NULL;
+
+        while (min != limit) {
+            // Compute the midpoint as an offset from min.  The naive
+            // (min + limit) / 2 wraps around uint32_t once both bounds lie in
+            // the upper half of the index range.
+            const uint32_t mid = min + (limit - min) / 2;
+            const int hv = h(this->d.a.values[mid], extra);
+            if (hv < 0) {
+                // Candidate; a later negative element may still exist.
+                best = mid;
+                min = mid + 1;
+            } else {
+                limit = mid;
+            }
+        }
+        if (best == subtree::NODE_NULL) {
+            return DB_NOTFOUND;
+        }
+        if (value != nullptr) {
+            copyout(value, &this->d.a.values[best]);
+        }
+        *idxp = best - this->d.a.start_idx;
+        return 0;
+    }
+
+    // Tree-representation counterpart of find_internal_minus_array: searches
+    // the subtree st for the rightmost node whose heaviside value
+    // h(node value, extra) is strictly negative.
+    //
+    // On success returns 0, stores the node's index relative to st in *idxp
+    // and, if value is non-null, stores the node's value (or a pointer to it,
+    // depending on omtdataout_t) through value.  Returns DB_NOTFOUND without
+    // writing *idxp or *value when the subtree holds no such node.
+    template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+    template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus(
+        const subtree &st,
+        const omtcmp_t &extra,
+        omtdataout_t *const value,
+        uint32_t *const idxp) const {
+        paranoid_invariant_notnull(idxp);
+        if (st.is_null()) {
+            return DB_NOTFOUND;
+        }
+
+        omt_node &node = this->d.t.nodes[st.get_index()];
+
+        if (h(node.value, extra) >= 0) {
+            // This node is not negative, so only the left subtree can hold
+            // the answer, and its indices need no adjustment.
+            return this->find_internal_minus<omtcmp_t, h>(
+                node.left, extra, value, idxp);
+        }
+
+        // This node is negative; prefer a later negative node on the right.
+        const int rc = this->find_internal_minus<omtcmp_t, h>(
+            node.right, extra, value, idxp);
+        if (rc == 0) {
+            // Shift the index past the left subtree and this node.
+            *idxp += this->nweight(node.left) + 1;
+            return 0;
+        }
+        if (rc != DB_NOTFOUND) {
+            return rc;
+        }
+
+        // Nothing negative on the right: this node is the rightmost one.
+        *idxp = this->nweight(node.left);
+        if (value != nullptr) {
+            copyout(value, &node);
+        }
+        return 0;
+    }
+}  // namespace toku
diff --git a/storage/tokudb/PerconaFT/util/omt.h b/storage/tokudb/PerconaFT/util/omt.h
new file mode 100644
index 00000000..849389b9
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/omt.h
@@ -0,0 +1,773 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stdint.h>
+#include <memory.h>
+#include <toku_portability.h>
+#include <toku_race_tools.h>
+#include "growable_array.h"
+
+namespace toku {
+
+/**
+ * Order Maintenance Tree (OMT)
+ *
+ * Maintains a collection of totally ordered values, where each value has an integer weight.
+ * The OMT is a mutable datatype.
+ *
+ * The Abstraction:
+ *
+ * An OMT is a vector of values, $V$, where $|V|$ is the length of the vector.
+ * The vector is numbered from $0$ to $|V|-1$.
+ * Each value has a weight.  The weight of the $i$th element is denoted $w(V_i)$.
+ *
+ * We can create a new OMT, which is the empty vector.
+ *
+ * We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
+ *  $|V'|=1+|V|$       and
+ *
+ *   V'_j = V_j       if $j<i$
+ *          x         if $j=i$
+ *          V_{j-1}   if $j>i$.
+ *
+ * We can specify $i$ using a kind of function instead of as an integer.
+ * Let $b$ be a function mapping from values to nonzero integers, such that
+ * the signum of $b$ is monotonically increasing.
+ * We can specify $i$ as the minimum integer such that $b(V_i)>0$.
+ *
+ * We look up a value using its index, or using a Heaviside function.
+ * For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing.
+ * When looking up values, we can look up
+ *  $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$.   (With a special return code if no such value exists.)
+ *      (Rationale:  Ordinarily we want $i$ to be unique.  But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.)
+ *  $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$.   (Or an indication that no such value exists.)
+ *  $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$.   (Or an indication that no such value exists.)
+ *
+ * When looking up a value using a Heaviside function, we get the value and its index.
+ *
+ * We can also split an OMT into two OMTs, splitting the weight of the values evenly.
+ * Find a value $j$ such that the values to the left of $j$ have about the same total weight as the values to the right of $j$.
+ * The resulting two OMTs contain the values to the left of $j$ and the values to the right of $j$ respectively.
+ * All of the values from the original OMT go into one of the new OMTs.
+ * If the weights of the values don't split exactly evenly, then the implementation has the freedom to choose whether
+ *  the new left OMT or the new right OMT is larger.
+ *
+ * Performance:
+ *  Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function.
+ *  The memory required is O(|V|).
+ *
+ * Usage:
+ *  The omt is templated by two parameters:
+ *   - omtdata_t is what will be stored within the omt.  These could be pointers or real data types (ints, structs).
+ *   - omtdataout_t is what will be returned by find and related functions.  By default, it is the same as omtdata_t, but you can set it to (omtdata_t *).
+ *  To create an omt which will store "TXNID"s, for example, it is a good idea to typedef the template:
+ *   typedef omt<TXNID> txnid_omt_t;
+ *  If you are storing structs, you may want to be able to get a pointer to the data actually stored in the omt (see find_zero).  To do this, use the second template parameter:
+ *   typedef omt<struct foo, struct foo *> foo_omt_t;
+ */
+
+namespace omt_internal {
+
+// Handle to a subtree for omts without mark support: a bare 32-bit node
+// index, where the reserved value NODE_NULL stands for "no subtree".
+template<bool subtree_supports_marks>
+class subtree_templated {
+private:
+    uint32_t m_index;
+
+public:
+    // Reserved index value meaning "no subtree".
+    static const uint32_t NODE_NULL = UINT32_MAX;
+
+    // Make this handle refer to no subtree.
+    void set_to_null() {
+        m_index = NODE_NULL;
+    }
+
+    // True when this handle refers to no subtree.
+    bool is_null() const {
+        return this->get_index() == NODE_NULL;
+    }
+
+    // The stored index (NODE_NULL when the handle is null).
+    uint32_t get_index() const {
+        return m_index;
+    }
+
+    // Point this handle at a real node; NODE_NULL is not a valid argument
+    // (use set_to_null for that).
+    void set_index(uint32_t index) {
+        paranoid_invariant(index != NODE_NULL);
+        m_index = index;
+    }
+};
+
+// Handle to a subtree for omts with mark support.  The low 31 bits of
+// m_bitfield carry the node index and the top bit is a flag that the owning
+// node uses for its mark bookkeeping (get_bit / enable_bit / disable_bit).
+template<>
+class subtree_templated<true> {
+private:
+    uint32_t m_bitfield;
+    // Top bit: the stolen flag.  Remaining 31 bits: the index.
+    static const uint32_t MASK_BIT = static_cast<uint32_t>(1) << 31;
+    static const uint32_t MASK_INDEX = ~(static_cast<uint32_t>(1) << 31);
+
+    // Replace the index bits while leaving the flag bit as it was.
+    void set_index_internal(uint32_t new_index) {
+        const uint32_t kept_flag = m_bitfield & MASK_BIT;
+        m_bitfield = kept_flag | new_index;
+    }
+
+public:
+    // Reserved index value meaning "no subtree"; fits in the 31 index bits.
+    static const uint32_t NODE_NULL = INT32_MAX;
+
+    // Make this handle refer to no subtree (the flag bit is preserved).
+    void set_to_null() {
+        this->set_index_internal(NODE_NULL);
+    }
+
+    // True when this handle refers to no subtree.
+    bool is_null() const {
+        return this->get_index() == NODE_NULL;
+    }
+
+    // The stored index with the flag bit masked off.  The read of
+    // m_bitfield is wrapped in DRD ignore annotations.
+    uint32_t get_index() const {
+        TOKU_DRD_IGNORE_VAR(m_bitfield);
+        const uint32_t snapshot = m_bitfield;
+        TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+        return snapshot & MASK_INDEX;
+    }
+
+    // Point this handle at a real node; index must be below NODE_NULL.
+    void set_index(uint32_t index) {
+        paranoid_invariant(index < NODE_NULL);
+        this->set_index_internal(index);
+    }
+
+    // Current state of the flag bit.  The read of m_bitfield is wrapped in
+    // DRD ignore annotations.
+    bool get_bit() const {
+        TOKU_DRD_IGNORE_VAR(m_bitfield);
+        const uint32_t snapshot = m_bitfield;
+        TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+        return (snapshot & MASK_BIT) != 0;
+    }
+
+    // Set the flag bit.
+    void enable_bit() {
+        // A thread holding a write lock on one leaf may set this bit while
+        // another thread, holding a (read or write) lock elsewhere, reads
+        // the index; the has_marks_below bit may also be set by two threads
+        // at once.  Neither is a real race, so DRD is told to ignore the
+        // variable just for the duration of the update.  A race on setting
+        // the index itself would be a real race.
+        TOKU_DRD_IGNORE_VAR(m_bitfield);
+        m_bitfield |= MASK_BIT;
+        TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+    }
+
+    // Clear the flag bit, keeping the index.
+    void disable_bit() {
+        m_bitfield &= MASK_INDEX;
+    }
+};
+
+// Tree node for omts without mark support: a weight, handles to the left
+// and right subtrees, and the stored value.
+template<typename omtdata_t, bool subtree_supports_marks>
+class omt_node_templated {
+public:
+    uint32_t weight;
+    subtree_templated<subtree_supports_marks> left;
+    subtree_templated<subtree_supports_marks> right;
+    omtdata_t value;
+
+    // Deliberate no-op.  Both node flavours must offer this method because
+    // callers have no "static if" to pick between them at compile time.
+    void clear_stolen_bits() {}
+};
+
+// Tree node for omts with mark support.  Same fields as the generic node;
+// the flag bit of `left` records "this node is marked" and the flag bit of
+// `right` records "marks below".
+template<typename omtdata_t>
+class omt_node_templated<omtdata_t, true> {
+public:
+    uint32_t weight;
+    subtree_templated<true> left;
+    subtree_templated<true> right;
+    omtdata_t value;
+
+    // --- "marked" flag, kept in left's stolen bit ---
+    bool get_marked() const {
+        return left.get_bit();
+    }
+    void set_marked_bit() {
+        left.enable_bit();
+    }
+    void unset_marked_bit() {
+        left.disable_bit();
+    }
+
+    // --- "marks below" flag, kept in right's stolen bit ---
+    bool get_marks_below() const {
+        return right.get_bit();
+    }
+    void set_marks_below_bit() {
+        // May be called by several threads.  Testing before writing avoids
+        // needless cache invalidation when the bit is already set.
+        if (this->get_marks_below()) {
+            return;
+        }
+        right.enable_bit();
+    }
+    void unset_marks_below_bit() {
+        right.disable_bit();
+    }
+
+    // Reset both stolen flag bits, leaving the subtree indices intact.
+    void clear_stolen_bits() {
+        this->unset_marked_bit();
+        this->unset_marks_below_bit();
+    }
+};
+
+}
+
+template<typename omtdata_t,
+         typename omtdataout_t=omtdata_t,
+         bool supports_marks=false>
+class omt {
+public:
+    /**
+     * Effect: Create an empty OMT.
+     * Performance: constant time.
+     */
+    void create(void);
+
+    /**
+     * Effect: Create an empty OMT with no internal allocated space.
+     * Performance: constant time.
+     * Rationale: In some cases we need a valid omt but don't want to malloc.
+     */
+    void create_no_array(void);
+
+    /**
+     * Effect: Create a OMT containing values.  The number of values is in numvalues.
+     *  Stores the new OMT in *omtp.
+     * Requires: this has not been created yet
+     * Requires: values != NULL
+     * Requires: values is sorted
+     * Performance:  time=O(numvalues)
+     * Rationale:    Normally to insert N values takes O(N lg N) amortized time.
+     *               If the N values are known in advance, are sorted, and
+     *               the structure is empty, we can batch insert them much faster.
+     */
+    __attribute__((nonnull))
+    void create_from_sorted_array(const omtdata_t *const values, const uint32_t numvalues);
+
+    /**
+     * Effect: Create an OMT containing values.  The number of values is in numvalues.
+     *         On success the OMT takes ownership of *values array, and sets values=NULL.
+     * Requires: this has not been created yet
+     * Requires: values != NULL
+     * Requires: *values is sorted
+     * Requires: *values was allocated with toku_malloc
+     * Requires: Capacity of the *values array is <= new_capacity
+     * Requires: On success, *values may not be accessed again by the caller.
+     * Performance:  time=O(1)
+     * Rationale:    create_from_sorted_array takes O(numvalues) time.
+     *               By taking ownership of the array, we save a malloc and memcpy,
+     *               and possibly a free (if the caller is done with the array).
+     */
+    void create_steal_sorted_array(omtdata_t **const values, const uint32_t numvalues, const uint32_t new_capacity);
+
+    /**
+     * Effect: Create a new OMT, storing it in *newomt.
+     *  The values to the right of index (starting at index) are moved to *newomt.
+     * Requires: newomt != NULL
+     * Returns
+     *    0             success,
+     *    EINVAL        if index > toku_omt_size(omt)
+     * On nonzero return, omt and *newomt are unmodified.
+     * Performance: time=O(n)
+     * Rationale:  We don't need a split-evenly operation.  We need to split items so that their total sizes
+     *  are even, and other similar splitting criteria.  It's easy to split evenly by calling size(), and dividing by two.
+     */
+    __attribute__((nonnull))
+    int split_at(omt *const newomt, const uint32_t idx);
+
+    /**
+     * Effect: Appends leftomt and rightomt to produce a new omt.
+     *  Creates this as the new omt.
+     *  leftomt and rightomt are destroyed.
+     * Performance: time=O(n) is acceptable, but one can imagine implementations that are O(\log n) worst-case.
+     */
+    __attribute__((nonnull))
+    void merge(omt *const leftomt, omt *const rightomt);
+
+    /**
+     * Effect: Creates a copy of an omt.
+     *  Creates this as the clone.
+     *  Each element is copied directly.  If they are pointers, the underlying data is not duplicated.
+     * Performance: O(n) or the running time of fill_array_with_subtree_values()
+     */
+    void clone(const omt &src);
+
+    /**
+     * Effect: Set the tree to be empty.
+     *  Note: Will not reallocate or resize any memory.
+     * Performance: time=O(1)
+     */
+    void clear(void);
+
+    /**
+     * Effect:  Destroy an OMT, freeing all its memory.
+     *   If the values being stored are pointers, their underlying data is not freed.  See free_items()
+     *   Those values may be freed before or after calling toku_omt_destroy.
+     * Rationale: Returns no values since free() cannot fail.
+     * Rationale: Does not free the underlying pointers to reduce complexity.
+     * Performance:  time=O(1)
+     */
+    void destroy(void);
+
+    /**
+     * Effect: return |this|.
+     * Performance:  time=O(1)
+     */
+    uint32_t size(void) const;
+    
+
+    /**
+     * Effect:  Insert value into the OMT.
+     *   If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
+     *   Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
+     *      If no such i exists, then let i be |V|
+     *   Then this has the same effect as
+     *    insert_at(tree, value, i);
+     *   If idx!=NULL then i is stored in *idx
+     * Requires:  The signum of h must be monotonically increasing.
+     * Returns:
+     *    0            success
+     *    DB_KEYEXIST  the key is present (h was equal to zero for some value)
+     * On nonzero return, omt is unchanged.
+     * Performance: time=O(\log N) amortized.
+     * Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
+     */
+    template<typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int insert(const omtdata_t &value, const omtcmp_t &v, uint32_t *const idx);
+
+    /**
+     * Effect: Increases indexes of all items at slot >= idx by 1.
+     *         Insert value into the position at idx.
+     * Returns:
+     *   0         success
+     *   EINVAL    if idx > this->size()
+     * On error, omt is unchanged.
+     * Performance: time=O(\log N) amortized time.
+     * Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
+     */
+    int insert_at(const omtdata_t &value, const uint32_t idx);
+
+    /**
+     * Effect:  Replaces the item at idx with value.
+     * Returns:
+     *   0       success
+     *   EINVAL  if idx>=this->size()
+     * On error, omt is unchanged.
+     * Performance: time=O(\log N)
+     * Rationale: The FT needs to be able to replace a value with another copy of the same value (allocated in a different location)
+     * 
+     */
+    int set_at(const omtdata_t &value, const uint32_t idx);
+
+    /**
+     * Effect: Delete the item in slot idx.
+     *         Decreases indexes of all items at slot > idx by 1.
+     * Returns
+     *     0            success
+     *     EINVAL       if idx>=this->size()
+     * On error, omt is unchanged.
+     * Rationale: To delete an item, first find its index using find or find_zero, then delete it.
+     * Performance: time=O(\log N) amortized.
+     */
+    int delete_at(const uint32_t idx);
+
+    /**
+     * Effect:  Iterate over the values of the omt, from left to right, calling f on each value.
+     *  The first argument passed to f is a ref-to-const of the value stored in the omt.
+     *  The second argument passed to f is the index of the value.
+     *  The third argument passed to f is iterate_extra.
+     *  The indices run from 0 (inclusive) to this->size() (exclusive).
+     * Requires: f != NULL
+     * Returns:
+     *  If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate.
+     *  If f always returns zero, then iterate returns 0.
+     * Requires:  Don't modify the omt while running.  (E.g., f may not insert or delete values from the omt.)
+     * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the omt.
+     * Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
+     * Rationale: We may at some point use functors, but for now this is a smaller change from the old OMT.
+     */
+    template<typename iterate_extra_t,
+             int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate(iterate_extra_t *const iterate_extra) const;
+
+    /**
+     * Effect:  Iterate over the values of the omt, from left to right, calling f on each value.
+     *  The first argument passed to f is a ref-to-const of the value stored in the omt.
+     *  The second argument passed to f is the index of the value.
+     *  The third argument passed to f is iterate_extra.
+     *  The indices run from 0 (inclusive) to this->size() (exclusive).
+     *  We will iterate only over [left,right)
+     *
+     * Requires: left <= right
+     * Requires: f != NULL
+     * Returns:
+     *  EINVAL  if right > this->size()
+     *  If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate_on_range.
+     *  If f always returns zero, then iterate_on_range returns 0.
+     * Requires:  Don't modify the omt while running.  (E.g., f may not insert or delete values from the omt.)
+     * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the omt.
+     * Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
+     */
+    template<typename iterate_extra_t,
+             int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const;
+
+    /**
+     * Effect: Iterate over the values of the omt, and mark the nodes that are visited.
+     *  Other than the marks, this behaves the same as iterate_on_range.
+     * Requires: supports_marks == true
+     * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the omt.
+     * Notes:
+     *  This function MAY be called concurrently by multiple threads, but
+     *  not concurrently with any other non-const function.
+     */
+    template<typename iterate_extra_t,
+             int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_and_mark_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra);
+
+    /**
+     * Effect: Iterate over the values of the omt, from left to right, calling f on each value whose node has been marked.
+     *  Other than the marks, this behaves the same as iterate.
+     * Requires: supports_marks == true
+     * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the omt.
+     */
+    template<typename iterate_extra_t,
+             int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_over_marked(iterate_extra_t *const iterate_extra) const;
+
+    /**
+     * Effect: Delete all elements from the omt, whose nodes have been marked.
+     * Requires: supports_marks == true
+     * Performance: time=O(N + i\log N) where i is the number of marked elements, {c,sh}ould be faster
+     */
+    void delete_all_marked(void);
+
+    /**
+     * Effect: Verify that the internal state of the marks in the tree are self-consistent.
+     *  Crashes the system if the marks are in a bad state.
+     * Requires: supports_marks == true
+     * Performance: time=O(N)
+     * Notes:
+     *  Even though this is a const function, it requires exclusive access.
+     * Rationale:
+     *  The current implementation of the marks relies on a sort of
+     *  "cache" bit representing the state of bits below it in the tree.
+     *  This allows glass-box testing that these bits are correct.
+     */
+    void verify_marks_consistent(void) const;
+
+    /**
+     * Effect: None
+     * Returns whether there are any marks in the tree.
+     */
+    bool has_marks(void) const;
+
+    /**
+     * Effect:  Iterate over the values of the omt, from left to right, calling f on each value.
+     *  The first argument passed to f is a pointer to the value stored in the omt.
+     *  The second argument passed to f is the index of the value.
+     *  The third argument passed to f is iterate_extra.
+     *  The indices run from 0 (inclusive) to this->size() (exclusive).
+     * Requires: same as for iterate()
+     * Returns: same as for iterate()
+     * Performance: same as for iterate()
+     * Rationale: In general, most iterators should use iterate() since they should not modify the data stored in the omt.  This function is for iterators which need to modify values (for example, free_items).
+     * Rationale: We assume if you are transforming the data in place, you want to do it to everything at once, so there is not yet an iterate_on_range_ptr (but there could be).
+     */
+    template<typename iterate_extra_t,
+             int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+    void iterate_ptr(iterate_extra_t *const iterate_extra);
+
+    /**
+     * Effect: Set *value=V_idx
+     * Returns
+     *    0             success
+     *    EINVAL        if index>=toku_omt_size(omt)
+     * On nonzero return, *value is unchanged
+     * Performance: time=O(\log N)
+     */
+    int fetch(const uint32_t idx, omtdataout_t *const value) const;
+
+    /**
+     * Effect:  Find the smallest i such that h(V_i, extra)>=0
+     *  If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value = V_i, and return 0.
+     *  If there is such an i and h(V_i,extra)>0  then set *idxp=i and return DB_NOTFOUND.
+     *  If there is no such i then set *idx=this->size() and return DB_NOTFOUND.
+     * Note: value is of type omtdataout_t, which may be of type (omtdata_t) or (omtdata_t *) but is fixed by the instantiation.
+     *  If it is the value type, then the value is copied out (even if the value type is a pointer to something else)
+     *  If it is the pointer type, then *value is set to a pointer to the data within the omt.
+     *  This is determined by the type of the omt as initially declared.
+     *   If the omt is declared as omt<foo_t>, then foo_t's will be stored and foo_t's will be returned by find and related functions.
+     *   If the omt is declared as omt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the stored items will be returned by find and related functions.
+     * Rationale:
+     *  Structs too small for malloc should be stored directly in the omt.
+     *  These structs may need to be edited as they exist inside the omt, so we need a way to get a pointer within the omt.
+     *  Using separate functions for returning pointers and values increases code duplication and reduces type-checking.
+     *  That also reduces the ability of the creator of a data structure to give advice to its future users.
+     *  Slight overloading in this case seemed to provide a better API and better type checking.
+     */
+    template<typename omtcmp_t,
+             int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int find_zero(const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const;
+
+    /**
+     *   Effect:
+     *    If direction >0 then find the smallest i such that h(V_i,extra)>0.
+     *    If direction <0 then find the largest  i such that h(V_i,extra)<0.
+     *    (Direction may not be equal to zero.)
+     *    If value!=NULL then store V_i in *value
+     *    If idxp!=NULL then store i in *idxp.
+     *   Requires: The signum of h is monotonically increasing.
+     *   Returns
+     *      0             success
+     *      DB_NOTFOUND   no such value is found.
+     *   On nonzero return, *value and *idxp are unchanged
+     *   Performance: time=O(\log N)
+     *   Rationale:
+     *     Here's how to use the find function to find various things
+     *       Cases for find:
+     *        find first value:         ( h(v)=+1, direction=+1 )
+     *        find last value           ( h(v)=-1, direction=-1 )
+     *        find first X              ( h(v)=(v< x) ? -1 : 1    direction=+1 )
+     *        find last X               ( h(v)=(v<=x) ? -1 : 1    direction=-1 )
+     *        find X or successor to X  ( same as find first X. )
+     *
+     *   Rationale: To help understand heaviside functions and the behavior of find:
+     *    There are 7 kinds of heaviside functions.
+     *    The signum of h must be monotonically increasing.
+     *    Given a function of the following form, A is the element
+     *    returned for direction>0, B is the element returned
+     *    for direction<0, C is the element returned for
+     *    direction==0 (see find_zero) (with a return of 0), and D is the element
+     *    returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
+     *    If any of A, B, or C are not found, then asking for the
+     *    associated direction will return DB_NOTFOUND.
+     *    See find_zero for more information.
+     *
+     *    Let the following represent the signum of the heaviside function.
+     *
+     *    -...-
+     *        A
+     *         D
+     *
+     *    +...+
+     *    B
+     *    D
+     *
+     *    0...0
+     *    C
+     *
+     *    -...-0...0
+     *        AC
+     *
+     *    0...0+...+
+     *    C    B
+     *
+     *    -...-+...+
+     *        AB
+     *         D
+     *
+     *    -...-0...0+...+
+     *        AC    B
+     */
+    template<typename omtcmp_t,
+             int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int find(const omtcmp_t &extra, int direction, omtdataout_t *const value, uint32_t *const idxp) const;
+
+    /**
+     * Effect: Return the size (in bytes) of the omt, as it resides in main memory.  If the data stored are pointers, don't include the size of what they all point to.
+     */
+    size_t memory_size(void);
+
+private:
+    // Type used to refer to nodes (see the node_idx parameters and return values of the private helpers below).
+    // NOTE(review): presumably an index into d.t.nodes rather than a pointer -- confirm in omt.cc.
+    typedef uint32_t node_idx;
+    // Shorthands for the internal subtree and node types, instantiated for this omt's supports_marks setting.
+    typedef omt_internal::subtree_templated<supports_marks> subtree;
+    typedef omt_internal::omt_node_templated<omtdata_t, supports_marks> omt_node;
+    // NOTE(review): ENSURE_POD is defined elsewhere; presumably a compile-time check that subtree is plain-old-data.
+    ENSURE_POD(subtree);
+
+    // Array form of the storage (member d.a of the union below).
+    // NOTE(review): the field meanings are inferred from their names; confirm against omt.cc.
+    struct omt_array {
+        uint32_t start_idx;   // presumably the offset of the first live value within values
+        uint32_t num_values;  // presumably the number of live values
+        omtdata_t *values;    // storage holding the values
+    };
+
+    // Tree form of the storage (member d.t of the union below).
+    // NOTE(review): the field meanings are inferred from their names; confirm against omt.cc.
+    struct omt_tree {
+        subtree root;         // the top-level subtree
+        uint32_t free_idx;    // presumably the next unused slot in nodes
+        omt_node *nodes;      // storage holding the nodes
+    };
+
+    // is_array presumably selects which member of the union d is currently live (d.a when true, d.t otherwise).
+    bool is_array;
+    // NOTE(review): capacity presumably counts the allocated slots of whichever form is live -- confirm in omt.cc.
+    uint32_t capacity;
+    union {
+        struct omt_array a;
+        struct omt_tree t;
+    } d;
+
+    __attribute__((nonnull))
+    void unmark(const subtree &subtree, const uint32_t index, GrowableArray<node_idx> *const indexes);
+
+    void create_internal_no_array(const uint32_t new_capacity);
+
+    void create_internal(const uint32_t new_capacity);
+
+    uint32_t nweight(const subtree &subtree) const;
+
+    node_idx node_malloc(void);
+
+    void node_free(const node_idx idx);
+
+    void maybe_resize_array(const uint32_t n);
+
+    __attribute__((nonnull))
+    void fill_array_with_subtree_values(omtdata_t *const array, const subtree &subtree) const;
+
+    void convert_to_array(void);
+
+    __attribute__((nonnull))
+    void rebuild_from_sorted_array(subtree *const subtree, const omtdata_t *const values, const uint32_t numvalues);
+
+    void convert_to_tree(void);
+
+    void maybe_resize_or_convert(const uint32_t n);
+
+    bool will_need_rebalance(const subtree &subtree, const int leftmod, const int rightmod) const;
+
+    __attribute__((nonnull))
+    void insert_internal(subtree *const subtreep, const omtdata_t &value, const uint32_t idx, subtree **const rebalance_subtree);
+
+    void set_at_internal_array(const omtdata_t &value, const uint32_t idx);
+
+    void set_at_internal(const subtree &subtree, const omtdata_t &value, const uint32_t idx);
+
+    void delete_internal(subtree *const subtreep, const uint32_t idx, omt_node *const copyn, subtree **const rebalance_subtree);
+
+    template<typename iterate_extra_t,
+             int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_internal_array(const uint32_t left, const uint32_t right,
+                                      iterate_extra_t *const iterate_extra) const;
+
+    template<typename iterate_extra_t,
+             int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+    void iterate_ptr_internal(const uint32_t left, const uint32_t right,
+                                     const subtree &subtree, const uint32_t idx,
+                                     iterate_extra_t *const iterate_extra);
+
+    template<typename iterate_extra_t,
+             int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+    void iterate_ptr_internal_array(const uint32_t left, const uint32_t right,
+                                           iterate_extra_t *const iterate_extra);
+
+    template<typename iterate_extra_t,
+             int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_internal(const uint32_t left, const uint32_t right,
+                                const subtree &subtree, const uint32_t idx,
+                                iterate_extra_t *const iterate_extra) const;
+
+    template<typename iterate_extra_t,
+             int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_and_mark_range_internal(const uint32_t left, const uint32_t right,
+                                        const subtree &subtree, const uint32_t idx,
+                                        iterate_extra_t *const iterate_extra);
+
+    template<typename iterate_extra_t,
+             int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+    int iterate_over_marked_internal(const subtree &subtree, const uint32_t idx,
+                                     iterate_extra_t *const iterate_extra) const;
+
+    uint32_t verify_marks_consistent_internal(const subtree &subtree, const bool allow_marks) const;
+
+    void fetch_internal_array(const uint32_t i, omtdataout_t *const value) const;
+
+    void fetch_internal(const subtree &subtree, const uint32_t i, omtdataout_t *const value) const;
+
+    __attribute__((nonnull))
+    void fill_array_with_subtree_idxs(node_idx *const array, const subtree &subtree) const;
+
+    __attribute__((nonnull))
+    void rebuild_subtree_from_idxs(subtree *const subtree, const node_idx *const idxs, const uint32_t numvalues);
+
+    __attribute__((nonnull))
+    void rebalance(subtree *const subtree);
+
+    __attribute__((nonnull))
+    static void copyout(omtdata_t *const out, const omt_node *const n);
+
+    __attribute__((nonnull))
+    static void copyout(omtdata_t **const out, omt_node *const n);
+
+    __attribute__((nonnull))
+    static void copyout(omtdata_t *const out, const omtdata_t *const stored_value_ptr);
+
+    __attribute__((nonnull))
+    static void copyout(omtdata_t **const out, omtdata_t *const stored_value_ptr);
+
+    template<typename omtcmp_t,
+             int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int find_internal_zero_array(const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename omtcmp_t,
+             int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int find_internal_zero(const subtree &subtree, const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename omtcmp_t,
+             int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int find_internal_plus_array(const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename omtcmp_t,
+             int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int find_internal_plus(const subtree &subtree, const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename omtcmp_t,
+             int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int find_internal_minus_array(const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const;
+
+    template<typename omtcmp_t,
+             int (*h)(const omtdata_t &, const omtcmp_t &)>
+    int find_internal_minus(const subtree &subtree, const omtcmp_t &extra, omtdataout_t *const value, uint32_t *const idxp) const;
+};
+
+} // namespace toku
+
+// include the implementation here
+#include "omt.cc"
diff --git a/storage/tokudb/PerconaFT/util/partitioned_counter.cc b/storage/tokudb/PerconaFT/util/partitioned_counter.cc
new file mode 100644
index 00000000..7a6b8ab2
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/partitioned_counter.cc
@@ -0,0 +1,417 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <toku_race_tools.h>
+#include <sys/types.h>
+#include <pthread.h>
+
+#include "memory.h"
+#include "partitioned_counter.h"
+#include "doubly_linked_list.h"
+#include "growable_array.h"
+#include <portability/toku_atomic.h>
+
+#ifdef __APPLE__
+// TODO(leif): The __thread declspec is broken in ways I don't understand
+// on Darwin.  Partitioned counters use them and it would be prohibitive
+// to tease them apart before a week after 6.5.0, so instead, we're just
+// not going to use them in the most brutal way possible.  This is a
+// terrible implementation of the API in partitioned_counter.h but it
+// should be correct enough to release a non-performant version on OSX for
+// development.  Soon, we need to either make portable partitioned
+// counters, or we need to do this disabling in a portable way.
+
+// Fallback representation used when __APPLE__ is defined: the whole counter is
+// a single 64-bit value, which increment_partitioned_counter updates with
+// toku_sync_fetch_and_add.
+struct partitioned_counter {
+    uint64_t v; // the current value of the counter
+};
+
+PARTITIONED_COUNTER create_partitioned_counter(void)
+// Effect: Create a counter.  Its storage comes from XCALLOC and is released
+//  by destroy_partitioned_counter.
+{
+    PARTITIONED_COUNTER XCALLOC(pc);
+    return pc;
+}
+
+// Effect: Release the counter's storage with toku_free.  The counter must not
+//  be used after this call.
+void destroy_partitioned_counter(PARTITIONED_COUNTER counter) {
+    toku_free(counter);
+}
+
+void increment_partitioned_counter(PARTITIONED_COUNTER counter, uint64_t delta)
+// Effect: Add delta to the counter using toku_sync_fetch_and_add.  The value
+//  that call returns is deliberately discarded.
+{
+    uint64_t *const value_addr = &counter->v;
+    (void) toku_sync_fetch_and_add(value_addr, delta);
+}
+
+uint64_t read_partitioned_counter(PARTITIONED_COUNTER counter)
+// Effect: Return the counter's current value.  This is a plain load of v, not
+//  an atomic operation.
+{
+    const uint64_t current = counter->v;
+    return current;
+}
+
+// This fallback implementation keeps no global state, so module-level setup
+// and teardown have nothing to do.
+void partitioned_counters_init(void) {}
+void partitioned_counters_destroy(void) {}
+
+#else // __APPLE__
+
+//******************************************************************************
+//
+// Representation: The representation of a partitioned counter comprises a
+//  sum, called sum_of_dead; an index, called the ckey, which indexes into a
+//  thread-local array to find a thread-local part of the counter; and a
+//  linked list of thread-local parts.
+//
+//  There is also a linked list, for each thread that has a thread-local part
+//  of any counter, of all the thread-local parts of all the counters.
+//
+//  There is a pthread_key which gives us a hook to clean up thread-local
+//  state when a thread terminates.  For each thread-local part of a counter
+//  that the thread has, we add in the thread-local sum into the sum_of_dead.
+//
+//  Finally there is a list of all the thread-local arrays so that when we
+//  destroy the partitioned counter before the threads are done, we can find
+//  and destroy the thread_local_arrays before destroying the pthread_key.
+//
+// Abstraction function: The sum is represented by the sum of sum_of_dead and
+//  the sums of the thread-local parts of the counter.
+//
+// Representation invariant: Every thread-local part is in the linked list of
+//  the thread-local parts of its counter, as well as in the linked list of
+//  the counters of the thread.
+//
+//******************************************************************************
+
+//******************************************************************************
+// The mutex for the PARTITIONED_COUNTER
+// We have a single mutex for all the counters because
+//  (a) the mutex is obtained infrequently, and
+//  (b) it helps us avoid race conditions when destroying the counters.
+// The alternative that I couldn't make work is to have a mutex per counter.
+//   But the problem is that the counter can be destroyed before threads
+//   terminate, or maybe a thread terminates before the counter is destroyed.
+//   If the counter is destroyed first, then the mutex is no longer available.
+//******************************************************************************
+
+using namespace toku;
+
+static pthread_mutex_t partitioned_counter_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void pc_lock (void)
+// Effect: Acquire partitioned_counter_mutex.  A failure to lock trips the assert.
+{
+    const int lock_status = pthread_mutex_lock(&partitioned_counter_mutex);
+    assert(lock_status == 0);
+}
+
+static void pc_unlock (void)
+// Effect: Release partitioned_counter_mutex.  A failure to unlock trips the assert.
+{
+    const int unlock_status = pthread_mutex_unlock(&partitioned_counter_mutex);
+    assert(unlock_status == 0);
+}
+
+//******************************************************************************
+// Key creation primitives
+//******************************************************************************
+static void pk_create (pthread_key_t *key, void (*destructor)(void*))
+// Effect: Create a pthread key with the given destructor and store it in *key.
+//  A creation failure trips the assert.
+{
+    const int create_status = pthread_key_create(key, destructor);
+    assert(create_status == 0);
+}
+
+static void pk_delete (pthread_key_t key)
+// Effect: Delete a pthread key.  A deletion failure trips the assert.
+{
+    const int delete_status = pthread_key_delete(key);
+    assert(delete_status == 0);
+}
+
+static void pk_setspecific (pthread_key_t key, const void *value)
+// Effect: Associate value with key for the calling thread.  A failure trips the assert.
+{
+    const int set_status = pthread_setspecific(key, value);
+    assert(set_status == 0);
+}
+
+//******************************************************************************
+// The counter itself.
+// The thread local part of a counter, comprising the thread-local sum, a pointer
+//  to the partitioned_counter, a pointer to the thread_local list head, and two
+//  linked lists. One of the lists is all the thread-local parts that belong to
+//  the same counter, and the other is all the thread-local parts that belong to
+//  the same thread.
+//******************************************************************************
+
+struct local_counter;
+
+// The shared part of a counter.  Its value is sum_of_dead plus the sum fields
+// of every local_counter on ll_counter_head (see read_partitioned_counter).
+// pc_key doubles as the slot number of this counter in each thread's
+// thread_local_array.
+struct partitioned_counter {
+    uint64_t       sum_of_dead;                                     // The sum of all thread-local counts from threads that have terminated.
+    uint64_t       pc_key;                                          // A unique integer among all counters that have been created but not yet destroyed.
+    DoublyLinkedList<struct local_counter *> ll_counter_head; // A linked list of all the thread-local information for this counter.
+};
+
+// One thread's private part of one counter.  It is allocated by
+// get_or_alloc_thread_local_counter and freed either by
+// destroy_thread_local_part_of_partitioned_counters (the thread exits first)
+// or by destroy_partitioned_counter (the counter is destroyed first).
+struct local_counter {
+    uint64_t                                   sum;                // The thread-local sum.
+    PARTITIONED_COUNTER                        owner_pc;           // The partitioned counter that this is part of.
+    GrowableArray<struct local_counter *>     *thread_local_array; // The thread local array for this thread holds this local_counter at offset owner_pc->pc_key.
+    LinkedListElement<struct local_counter *>  ll_in_counter;      // Element for the doubly-linked list of thread-local information for this PARTITIONED_COUNTER.
+};
+
+// Per-thread state.  The original note here reads "Try to get it into one cache line by aligning it."
+// NOTE(review): no alignment attribute is present on these declarations.
+// thread_local_array holds this thread's local_counter pointers, indexed by pc_key (NULL where the thread has no part).
+// thread_local_array_inited is set once the thread has initialized and registered its array, and cleared when the thread's parts are destroyed.
+static __thread GrowableArray<struct local_counter *> thread_local_array;
+static __thread bool                                  thread_local_array_inited = false;
+
+// all_thread_local_arrays lists every registered thread_local_array; thread_local_ll_elt is this thread's element in that list.
+static DoublyLinkedList<GrowableArray<struct local_counter *> *> all_thread_local_arrays;
+static __thread LinkedListElement<GrowableArray<struct local_counter *> *> thread_local_ll_elt;
+
+static void destroy_thread_local_part_of_partitioned_counters (void *ignore_me);
+static void destroy_thread_local_part_of_partitioned_counters (void *ignore_me __attribute__((__unused__)))
+// Effect: Registered as the destructor of thread_destructor_key, so it runs
+//  when a thread terminates.  Under the pc lock, every local counter this
+//  thread owns is folded into its owner's sum_of_dead, unlinked from the
+//  owner's list and freed; then the thread's array is unregistered and
+//  deinitialized.  The argument is the key's value, which is a static string:
+//  it is ignored and must not be freed.
+{
+    pc_lock();
+    // The array is not resized inside the loop, so its size can be read once.
+    const size_t n_slots = thread_local_array.get_size();
+    for (size_t slot = 0; slot < n_slots; slot++) {
+        struct local_counter *const part = thread_local_array.fetch_unchecked(slot);
+        if (part != NULL) {
+            PARTITIONED_COUNTER pc = part->owner_pc;
+            pc->sum_of_dead += part->sum;
+            pc->ll_counter_head.remove(&part->ll_in_counter);
+            toku_free(part);
+        }
+    }
+    all_thread_local_arrays.remove(&thread_local_ll_elt);
+    thread_local_array_inited = false;
+    thread_local_array.deinit();
+    pc_unlock();
+}
+
+//******************************************************************************
+// We employ a system-wide pthread_key simply to get a notification when a
+//  thread terminates. The key will simply contain a constant string (it's "dont
+//  care", but it doesn't matter what it is, as long as it's not NULL).  We need
+//  a constructor function to set up the pthread_key.  We used a constructor
+//  function instead of a C++ constructor because that's what we are used to,
+//  rather than because it's necessarily better.  Whenever a thread tries to
+//  increment a partitioned_counter for the first time, it sets the
+//  pthread_setspecific for the thread_destructor_key.  It's OK if the key gets
+//  setspecific multiple times, it's always the same value.  When a thread (that
+//  has created a thread-local part of any partitioned counter) terminates, the
+//  destroy_thread_local_part_of_partitioned_counters will run.  It may run
+//  before or after other pthread_key destructors, but the thread-local
+//  ll_thread_head variable is still present until the thread is completely done
+//  running.
+//******************************************************************************
+
+static pthread_key_t thread_destructor_key;
+
+//******************************************************************************
+// We don't like using up pthread_keys (macos provides only 128 of them),
+// so we built our own.   Also, looking at the source code for linux libc,
+// it looks like pthread_keys get slower if there are a lot of them.
+// So we use only one pthread_key.
+//******************************************************************************
+
+GrowableArray<bool> counters_in_use;
+
+static uint64_t allocate_counter (void)
+// Effect: Claim the lowest-numbered unused slot of counters_in_use, appending
+//  a new slot when every existing one is taken, and return its number.
+//  Grabs the pc_lock.
+{
+    pc_lock();
+    const size_t n_slots = counters_in_use.get_size();
+    // Skip over the slots that are already taken.
+    uint64_t chosen = 0;
+    while (chosen < n_slots && counters_in_use.fetch_unchecked(chosen)) {
+        chosen++;
+    }
+    if (chosen < n_slots) {
+        counters_in_use.store_unchecked(chosen, true);
+    } else {
+        // No free slot: chosen == n_slots, which is the index of the slot pushed here.
+        counters_in_use.push(true);
+    }
+    pc_unlock();
+    return chosen;
+}
+
+
+static void free_counter(uint64_t counternum)
+// Effect: Mark counter number counternum as unused so that allocate_counter
+//  may hand it out again.
+// Requires: The pc mutex is held before calling, and counternum is currently
+//  marked as in use (both conditions on counternum are asserted).
+{
+    const size_t n_slots = counters_in_use.get_size();
+    assert(counternum < n_slots);
+    const bool was_in_use = counters_in_use.fetch_unchecked(counternum);
+    assert(was_in_use);
+    counters_in_use.store_unchecked(counternum, false);
+}
+
+// Effect: Deinitialize counters_in_use, the table of allocated counter numbers.
+//  Called from partitioned_counters_destroy while the pc lock is held.
+static void destroy_counters (void) {
+    counters_in_use.deinit();
+}
+
+
+//******************************************************************************
+// Now for the code that actually creates a counter.
+//******************************************************************************
+
+PARTITIONED_COUNTER create_partitioned_counter(void)
+// Effect: Create a counter, initialized to zero: no dead sum, an empty list
+//  of thread-local parts, and a freshly allocated counter number.
+{
+    PARTITIONED_COUNTER XMALLOC(pc);
+    pc->ll_counter_head.init();
+    pc->sum_of_dead = 0;
+    pc->pc_key = allocate_counter();   // takes and releases the pc lock
+    return pc;
+}
+
+void destroy_partitioned_counter(PARTITIONED_COUNTER pc)
+// Effect: Destroy the counter.  No operations on this counter are permitted after.
+// Implementation note: Everything happens under the global lock, so the
+//  thread-local parts owned by other threads can be torn down here as well.
+{
+    pc_lock();
+    const uint64_t key = pc->pc_key;
+    LinkedListElement<struct local_counter *> *elt;
+    for (;;) {
+        if (!pc->ll_counter_head.pop(&elt)) {
+            break;
+        }
+        // elt is already off the counter's list; also clear the owning thread's slot for this counter.
+        struct local_counter *part = elt->get_container();
+        assert(pc == part->owner_pc);
+        part->thread_local_array->store_unchecked(key, NULL);
+        toku_free(part);
+    }
+    toku_free(pc);
+    free_counter(key);
+    pc_unlock();
+}
+
+static inline struct local_counter *get_thread_local_counter(uint64_t pc_key, GrowableArray<struct local_counter *> *a)
+// Effect: Return the entry of a at index pc_key, or NULL when pc_key is at or
+//  beyond the array's current size.
+{
+    const bool in_range = pc_key < a->get_size();
+    return in_range ? a->fetch_unchecked(pc_key) : NULL;
+}
+
+static struct local_counter *get_or_alloc_thread_local_counter(PARTITIONED_COUNTER pc)
+// Effect: Return the calling thread's local part of pc, creating and
+//  registering it on first use.
+{
+    // Only this thread is allowed to modify thread_local_array, except for setting tla->array[pc_key] to NULL
+    // when a counter is destroyed (and in that case there should be no race because no other thread should be
+    // trying to access the same local counter at the same time).
+    const uint64_t key = pc->pc_key;
+    struct local_counter *part = get_thread_local_counter(key, &thread_local_array);
+    if (part != NULL) {
+        // Common case: this thread already has its part, and no lock is taken.
+        return part;
+    }
+
+    XMALLOC(part);    // Might as well do the malloc without holding the pc lock.  But most of the rest of this work needs the lock.
+    pc_lock();
+
+    // Set things up so that when this thread terminates, the thread-local parts of the counter will be destroyed and merged into their respective counters.
+    if (!thread_local_array_inited) {
+        pk_setspecific(thread_destructor_key, "dont care");
+        thread_local_array_inited=true;
+        thread_local_array.init();
+        all_thread_local_arrays.insert(&thread_local_ll_elt, &thread_local_array);
+    }
+
+    part->sum = 0;
+    TOKU_VALGRIND_HG_DISABLE_CHECKING(&part->sum, sizeof(part->sum)); // the counter increment is kind of racy.
+    part->owner_pc = pc;
+    part->thread_local_array = &thread_local_array;
+
+    // Grow the array if needed, filling in NULLs, so that slot `key` exists.
+    while (thread_local_array.get_size() <= key) {
+        thread_local_array.push(NULL);
+    }
+    thread_local_array.store_unchecked(key, part);
+    pc->ll_counter_head.insert(&part->ll_in_counter, part);
+    pc_unlock();
+    return part;
+}
+
+void increment_partitioned_counter(PARTITIONED_COUNTER pc, uint64_t amount)
+// Effect: Increment the counter by amount.  The addition is applied to the
+//  calling thread's local part, which is created on first use.
+// Requires: No overflows.  This is a 64-bit unsigned counter.
+{
+    get_or_alloc_thread_local_counter(pc)->sum += amount;
+}
+
+static int sumit(struct local_counter *lc, uint64_t *sum)
+// Effect: Iteration callback used by read_partitioned_counter: add one
+//  thread-local sum into *sum.  Always returns 0.
+{
+    const uint64_t local_part = lc->sum;
+    *sum = *sum + local_part;
+    return 0;
+}
+
+uint64_t read_partitioned_counter(PARTITIONED_COUNTER pc)
+// Effect: Return the current value of the counter.
+// Implementation note: Under the pc lock, start from sum_of_dead and add the
+//  sum of every thread-local part on the counter's list.
+{
+    uint64_t total;
+    pc_lock();
+    total = pc->sum_of_dead;
+    const int iterate_status = pc->ll_counter_head.iterate<uint64_t *>(sumit, &total);
+    assert(iterate_status == 0);
+    pc_unlock();
+    return total;
+}
+
+void partitioned_counters_init(void)
+// Effect: Initialize any partitioned counters data structures that must be set up before any partitioned counters run:
+//  the list of thread-local arrays, and the pthread key whose destructor cleans up a terminating thread's parts.
+{
+    all_thread_local_arrays.init();
+    pk_create(&thread_destructor_key, destroy_thread_local_part_of_partitioned_counters);
+}
+
+void partitioned_counters_destroy(void)
+// Effect: Destroy any partitioned counters data structures.  Under the pc
+//  lock, every registered thread-local array is deinitialized, then the
+//  pthread key is deleted and the table of counter numbers is torn down.
+{
+    pc_lock();
+    LinkedListElement<GrowableArray<struct local_counter *> *> *elt;
+    while (all_thread_local_arrays.pop(&elt)) {
+        GrowableArray<struct local_counter *> *array = elt->get_container();
+        array->deinit();
+    }
+
+    pk_delete(thread_destructor_key);
+    destroy_counters();
+    pc_unlock();
+}
+
+#endif // __APPLE__
diff --git a/storage/tokudb/PerconaFT/util/partitioned_counter.h b/storage/tokudb/PerconaFT/util/partitioned_counter.h
new file mode 100644
index 00000000..d5bf97cf
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/partitioned_counter.h
@@ -0,0 +1,149 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// Overview: A partitioned_counter provides a counter that can be incremented and the running sum can be read at any time.
+//  We assume that increments are frequent, whereas reading is infrequent.
+// Implementation hint: Use thread-local storage so each thread increments its own data.  The increment does not require a lock or atomic operation.
+//  Reading the data can be performed by iterating over the thread-local versions, summing them up.
+//  The data structure also includes a sum for all the threads that have died.
+//  Use a pthread_key to create the thread-local versions.  When a thread finishes, the system calls pthread_key destructor which can add that thread's copy
+//  into the sum_of_dead counter.
+// Rationale: For statistics such as are found in engine status, we need a counter that requires no cache misses to increment.  We've seen significant
+//  performance speedups by removing certain counters.  Rather than removing those statistics, we would like to just make the counter fast.
+//  We generally increment the counters frequently, and want to fetch the values infrequently.
+//  The counters are monotonic.
+//  The counters can be split into many counters, which can be summed up at the end.
+//  We don't care if we get slightly out-of-date counter sums when we read the counter.  We don't care if there is a race between reading a counter
+//   variable and incrementing it.
+//  See tests/test_partitioned_counter.c for some performance measurements.
+// Operations:
+//   create_partitioned_counter    Create a counter initialized to zero.
+//   destroy_partitioned_counter   Destroy it.
+//   increment_partitioned_counter Increment it.  This is the frequent operation.
+//   read_partitioned_counter      Get the current value.  This is infrequent.
+// See partitioned_counter.cc for the abstraction function and representation invariant.
+//
+// The google style guide says to avoid using constructors, and it appears that
+// constructors may have broken all the tests, because they called
+// pthread_key_create before the key was actually created.  So the google style
+// guide may have some wisdom there...
+//
+// This version does not use constructors, essentially reverting to the google C++ style guide.
+//
+
+// The old C interface.  This required a bunch of explicit __attribute__((__destructor__)) functions to remember to destroy counters at the end.
+// A counter is an opaque handle: create it, increment it from any thread, read it, and finally destroy it.
+// partitioned_counters_init must run before any counter is used, and partitioned_counters_destroy tears the module down.
+#include <stdint.h> // uint64_t is used by the prototypes below; include it so this header is self-contained.
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct partitioned_counter *PARTITIONED_COUNTER;
+PARTITIONED_COUNTER create_partitioned_counter(void);
+// Effect: Create a counter, initialized to zero.
+
+void destroy_partitioned_counter(PARTITIONED_COUNTER);
+// Effect: Destroy the counter.  No operations on that counter are permitted after this.
+
+void increment_partitioned_counter(PARTITIONED_COUNTER, uint64_t amount);
+// Effect: Increment the counter by amount.
+// Requires: No overflows.  This is a 64-bit unsigned counter.
+
+uint64_t read_partitioned_counter(PARTITIONED_COUNTER) __attribute__((__visibility__("default")));
+// Effect: Return the current value of the counter.
+
+void partitioned_counters_init(void);
+// Effect: Initialize any partitioned counters data structures that must be set up before any partitioned counters run.
+
+void partitioned_counters_destroy(void);
+// Effect: Destroy any partitioned counters data structures.
+
+#if defined(__cplusplus)
+}
+#endif
+
+#if 0
+#include <pthread.h>
+#include "fttypes.h"
+
+// Used inside the PARTITIONED_COUNTER.
+struct linked_list_head {
+    struct linked_list_element *first;
+};
+
+
+class PARTITIONED_COUNTER {
+public:
+    PARTITIONED_COUNTER(void);
+    // Effect: Construct a counter, initialized to zero.
+
+    ~PARTITIONED_COUNTER(void);
+    // Effect: Destruct the counter.
+
+    void increment(uint64_t amount);
+    // Effect: Increment the counter by amount.  This is a 64-bit unsigned counter, and if you overflow it, you will get overflowed results (that is mod 2^64).
+    // Requires: Don't use this from a static constructor or destructor.
+
+    uint64_t read(void);
+    // Effect: Read the sum.
+    // Requires: Don't use this from a static constructor or destructor.
+
+private:
+    uint64_t       _sum_of_dead;             // The sum of all thread-local counts from threads that have terminated.
+    pthread_key_t   _key;                     // The pthread_key which gives us the hook to construct and destruct thread-local storage.
+    struct linked_list_head _ll_counter_head; // A linked list of all the thread-local information for this counter.
+    
+    // This function is used to destroy the thread-local part of the state when a thread terminates.
+    // But it's not the destructor for the local part of the counter, it's a destructor on a "dummy" key just so that we get a notification when a thread ends.
+    friend void destroy_thread_local_part_of_partitioned_counters (void *);
+};
+#endif
diff --git a/storage/tokudb/PerconaFT/util/queue.cc b/storage/tokudb/PerconaFT/util/queue.cc
new file mode 100644
index 00000000..39dfbbc6
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/queue.cc
@@ -0,0 +1,182 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <toku_portability.h>
+#include "toku_os.h"
+#include <errno.h>
+#include <toku_assert.h>
+#include "queue.h"
+#include "memory.h"
+#include <toku_pthread.h>
+
+toku_instr_key *queue_result_mutex_key;
+toku_instr_key *queue_result_cond_key;
+
+struct qitem;
+
+struct qitem {
+    void *item;           // the caller's payload; stored by enq and handed back by deq unchanged
+    struct qitem *next;   // the next-younger element, or NULL when this is the tail
+    uint64_t weight;      // the weight the producer passed for this item
+};
+
+struct queue {
+    uint64_t contents_weight; // how much stuff is in there?
+    uint64_t weight_limit;    // Block enqueueing when the contents gets to be bigger than the weight.
+    struct qitem *head, *tail; // oldest and newest elements; both NULL when the queue is empty
+
+    bool eof;                 // set once by toku_queue_eof(); no enqueue is allowed afterwards
+
+    toku_mutex_t mutex;       // held by enq, deq and eof while they touch the fields above
+    toku_cond_t  cond;        // the one condition variable both the producer and the consumer wait on
+};
+
+// Representation invariant:
+//   q->contents_weight is the sum of the weights of everything in the queue.
+//   q->weight_limit    is the limit on the weight before we block.
+//   q->head is the oldest thing in the queue.  q->tail is the newest.  (If nothing is in the queue then both are NULL)
+//   If q->head is not null:
+//    q->head->item is the oldest item.
+//    q->head->weight is the weight of that item.
+//    q->head->next is the next youngest thing.
+//   q->eof indicates that the producer has said "that's all".
+//   q->mutex protects every field above; q->cond is the single condition variable that both the producer and the consumer wait on.
+
+
+// Allocate an empty queue whose producer blocks once the contents outweigh weight_limit.
+int toku_queue_create (QUEUE *q, uint64_t weight_limit)
+{
+    QUEUE CALLOC(fresh);
+    if (fresh == NULL) { return get_error_errno(); }  // *q is left untouched on failure
+    fresh->head = fresh->tail = NULL;
+    fresh->contents_weight = 0;
+    fresh->weight_limit = weight_limit;
+    fresh->eof = false;
+    toku_mutex_init(*queue_result_mutex_key, &fresh->mutex, nullptr);
+    toku_cond_init(*queue_result_cond_key, &fresh->cond, nullptr);
+    *q = fresh;
+    return 0;
+}
+
+int toku_queue_destroy (QUEUE q)
+{
+    if (q->head != NULL) { return EINVAL; }  // still holds items: the queue is left intact
+    assert(q->contents_weight == 0);         // an empty list must carry no weight
+    toku_mutex_destroy(&q->mutex);
+    toku_cond_destroy(&q->cond);
+    toku_free(q);
+    return 0;
+}
+
+int toku_queue_enq (QUEUE q, void *item, uint64_t weight, uint64_t *total_weight_after_enq)
+{
+    toku_mutex_lock(&q->mutex);
+    assert(!q->eof);  // enqueueing after toku_queue_eof() is a caller bug
+    // The item is always linked in, even when that pushes the queue over its limit.
+    struct qitem *MALLOC(node);
+    if (node == NULL) {
+        const int err = get_error_errno();
+        toku_mutex_unlock(&q->mutex);
+        return err;
+    }
+    node->item   = item;
+    node->weight = weight;
+    node->next   = NULL;
+    q->contents_weight += weight;
+    if (q->tail == NULL) {
+        assert(q->head == NULL);
+        q->head = node;
+    } else {
+        q->tail->next = node;
+    }
+    q->tail = node;
+    // Let a waiting consumer know there is something to dequeue.
+    toku_cond_signal(&q->cond);
+    // Hold the producer here until the consumer has drained enough weight.
+    while (q->contents_weight > q->weight_limit) {
+        toku_cond_wait(&q->cond, &q->mutex);
+    }
+    // At or below the limit: report the weight (if asked) and return.
+    if (total_weight_after_enq != NULL) {
+        *total_weight_after_enq = q->contents_weight;
+    }
+    toku_mutex_unlock(&q->mutex);
+    return 0;
+}
+
+int toku_queue_eof (QUEUE q)
+{
+    toku_mutex_lock(&q->mutex);
+    assert(!q->eof);             // EOF may be declared only once
+    q->eof = true;
+    toku_cond_signal(&q->cond);  // wake a consumer blocked in toku_queue_deq so it can observe eof
+    toku_mutex_unlock(&q->mutex);
+    return 0;
+}
+
+int toku_queue_deq (QUEUE q, void **item, uint64_t *weight, uint64_t *total_weight_after_deq)
+{
+    toku_mutex_lock(&q->mutex);
+    // Sleep until the producer supplies an item or declares EOF.
+    while (q->head == NULL && !q->eof) {
+        toku_cond_wait(&q->cond, &q->mutex);
+    }
+    int rc = 0;
+    struct qitem *oldest = q->head;
+    if (oldest != NULL) {
+        // Unlink the oldest element and hand its payload to the caller.
+        q->head = oldest->next;
+        if (q->head == NULL) {
+            q->tail = NULL;
+        }
+        q->contents_weight -= oldest->weight;
+        *item = oldest->item;
+        if (weight != NULL) { *weight = oldest->weight; }
+        if (total_weight_after_deq != NULL) {
+            *total_weight_after_deq = q->contents_weight;
+        }
+        toku_free(oldest);
+        // The weight just dropped, so a producer blocked on the limit may proceed.
+        toku_cond_signal(&q->cond);
+    } else {
+        assert(q->eof);  // the wait loop exits with an empty list only at EOF
+        rc = EOF;
+    }
+    toku_mutex_unlock(&q->mutex);
+    return rc;
+}
diff --git a/storage/tokudb/PerconaFT/util/queue.h b/storage/tokudb/PerconaFT/util/queue.h
new file mode 100644
index 00000000..c6f1f740
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/queue.h
@@ -0,0 +1,83 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// The abstraction:
+//
+// queue.h implements a queue suitable for a producer-consumer relationship between two pthreads.
+// The enqueue/dequeue operation is fairly heavyweight (involving pthread condition variables) so it may be useful
+// to enqueue large chunks rather than small chunks.
+// It probably won't work right to have two consumer threads.
+//
+// Every item inserted into the queue has a weight.  If the weight
+// gets too big, then the queue blocks on trying to insert more items.
+// The weight can be used to limit the total number of items in the
+// queue (weight of each item=1) or the total memory consumed by queue
+// items (weight of each item is its size).  Or the weights could all be
+// zero for an unlimited queue.
+
+typedef struct queue *QUEUE;
+
+int toku_queue_create (QUEUE *q, uint64_t weight_limit);
+// Effect: Create a queue with a given weight limit.  The queue is initially empty.
+
+int toku_queue_enq (QUEUE q, void *item, uint64_t weight, uint64_t *total_weight_after_enq);
+// Effect: Insert ITEM of weight WEIGHT into queue.  If the resulting contents weight too much then block (don't return) until the total weight is low enough.
+// If total_weight_after_enq!=NULL then return the current weight of the items in the queue (after finishing blocking on overweight, and after enqueueing the item).
+// If successful return 0.
+// If an error occurs, return the error number, and the state of the queue is undefined.  The item may have been enqueued or not, and in fact the queue may be badly corrupted if the condition variables go awry.  If it's just a matter of out-of-memory, then the queue is probably OK.
+// Requires: There is only a single consumer.  (We wake up the consumer using a pthread_cond_signal, which is suitable only for a single consumer.)
+
+int toku_queue_eof (QUEUE q);
+// Effect: Inform the queue that no more values will be inserted.  After all the values that have been inserted are dequeued, further dequeue operations will return EOF.
+// Returns 0 on success.   On failure, things are pretty bad (likely to be some sort of mutex failure).
+
+int toku_queue_deq (QUEUE q, void **item, uint64_t *weight, uint64_t *total_weight_after_deq);
+// Effect: Wait until the queue becomes nonempty.  Then dequeue and return the oldest item.  The item is returned in *ITEM.
+// If weight!=NULL then return the item's weight in *weight.
+// If total_weight_after_deq!=NULL then return the current weight of the items in the queue (after dequeuing the item).
+// Return 0 if an item is returned.
+// Return EOF if no more items will be returned.
+// Usage note: The queue should be destroyed only after any consumers will no longer look at it (for example, they saw EOF).
+
+int toku_queue_destroy (QUEUE q);
+// Effect: Destroy the queue.
+// Requires: The queue must be empty and no consumer should try to dequeue after this (one way to do this is to make sure the consumer saw EOF).
+// Returns 0 on success.   If the queue is not empty, returns EINVAL.  Other errors are likely to be bad (some sort of mutex or condvar failure).
+
diff --git a/storage/tokudb/PerconaFT/util/rwlock.h b/storage/tokudb/PerconaFT/util/rwlock.h
new file mode 100644
index 00000000..d9a13ba9
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/rwlock.h
@@ -0,0 +1,348 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <toku_assert.h>
+#include <toku_portability.h>
+#include <toku_instrumentation.h>
+
+/* Readers/writers locks implementation
+ *
+ *****************************************
+ *     Overview
+ *****************************************
+ *
+ * PerconaFT employs readers/writers locks for the ephemeral locks (e.g.,
+ * on FT nodes) Why not just use the toku_pthread_rwlock API?
+ *
+ *   1) we need multiprocess rwlocks (not just multithreaded)
+ *
+ *   2) pthread rwlocks are very slow since they entail a system call
+ *   (about 2000ns on a 2GHz T2500.)
+ *
+ *     Related: We expect the common case to be that the lock is
+ *     granted
+ *
+ *   3) We are willing to employ machine-specific instructions (such
+ *   as atomic exchange, and mfence, each of which runs in about
+ *   10ns.)
+ *
+ *   4) We want to guarantee nonstarvation (many rwlock
+ *   implementations can starve the writers because another reader
+ *   comes along before all the other readers have unlocked.)
+ *
+ *****************************************
+ *      How it works
+ *****************************************
+ *
+ * We arrange that the rwlock object is in the address space of both
+ * threads or processes.  For processes we use mmap().
+ *
+ * The rwlock struct comprises the following fields
+ *
+ *    a long mutex field (which is accessed using xchgl() or other
+ *    machine-specific instructions.  This is a spin lock.
+ *
+ *    a read counter (how many readers currently have the lock?)
+ *
+ *    a write boolean (does a writer have the lock?)
+ *
+ *    a singly linked list of semaphores for waiting requesters.  This
+ *    list is sorted oldest requester first.  Each list element
+ *    contains a semaphore (which is provided by the requestor) and a
+ *    boolean indicating whether it is a reader or a writer.
+ *
+ * To lock a read rwlock:
+ *
+ *    1) Acquire the mutex.
+ *
+ *    2) If the linked list is not empty or the writer boolean is true
+ *    then
+ *
+ *       a) initialize your semaphore (to 0),
+ *       b) add your list element to the end of the list (with  rw="read")
+ *       c) release the mutex
+ *       d) wait on the semaphore
+ *       e) when the semaphore release, return success.
+ *
+ *    3) Otherwise increment the reader count, release the mutex, and
+ *    return success.
+ *
+ * To lock the write rwlock is almost the same.
+ *     1) Acquire the mutex
+ *     2) If the list is not empty or the reader count is nonzero
+ *        a) initialize semaphore
+ *        b) add to end of list (with rw="write")
+ *        c) release mutex
+ *        d) wait on the semaphore
+ *        e) return success when the semaphore releases
+ *     3) Otherwise set writer=true, release mutex and return success.
+ *
+ * To unlock a read rwlock:
+ *     1) Acquire mutex
+ *     2) Decrement reader count
+ *     3) If the count is still positive or the list is empty then
+ *        return success
+ *     4) Otherwise (count==zero and the list is nonempty):
+ *        a) If the first element of the list is a reader:
+ *            i) while the first element is a reader:
+ *                 x) pop the list
+ *                 y) increment the reader count
+ *                 z) increment the semaphore (releasing it for some waiter)
+ *            ii) return success
+ *        b) Else if the first element is a writer
+ *            i) pop the list
+ *            ii) set writer to true
+ *            iii) increment the semaphore
+ *            iv) return success
+ */
+
+//Use case:
+// A read lock is acquired by threads that get and pin an entry in the
+// cachetable. A write lock is acquired by the writer thread when an entry
+// is evicted from the cachetable and is being written to storage.
+
+//Use case:
+// General purpose reader writer lock with properties:
+// 1. multiple readers, no writers
+// 2. one writer at a time
+// 3. pending writers have priority over pending readers
+
+// An external mutex must be locked when using these functions.  An alternate
+// design would bury a mutex into the rwlock itself.  While this may
+// increase parallelism at the expense of single thread performance, we
+// are experimenting with a single higher level lock.
+
+extern toku_instr_key *rwlock_cond_key;
+extern toku_instr_key *rwlock_wait_read_key;
+extern toku_instr_key *rwlock_wait_write_key;
+
+typedef struct st_rwlock *RWLOCK;
+struct st_rwlock {
+    int reader;     // the number of readers that currently hold the lock
+    int want_read;  // the number of readers blocked in rwlock_read_lock
+    toku_cond_t wait_read;       // blocked readers wait here; broadcast by rwlock_write_unlock
+    int writer;                  // the number of writers holding the lock (rwlock_write_unlock asserts it is 1)
+    int want_write;              // the number of writers blocked in rwlock_write_lock
+    toku_cond_t wait_write;      // blocked writers wait here; signalled by both unlock functions
+    toku_cond_t *wait_users_go_to_zero;  // set by rwlock_wait_for_users while it waits; signalled when rwlock_users() reaches 0
+#if defined(TOKU_MYSQL_WITH_PFS)
+    toku_pthread_rwlock_t prwlock;  // passed to the toku_instr_rwlock_* instrumentation calls
+#endif
+};
+
+// Returns the total number of users of the lock: the threads that hold it
+// (readers and writers) plus the threads that are blocked waiting for it.
+static inline int rwlock_users(RWLOCK rwlock) {
+    const int holders = rwlock->reader + rwlock->writer;
+    const int waiters = rwlock->want_read + rwlock->want_write;
+    return holders + waiters;
+}
+
+#if defined(TOKU_MYSQL_WITH_PFS)
+#define rwlock_init(K, R) inline_rwlock_init(K, R)
+#else  // without PFS the instrumentation key K is dropped
+#define rwlock_init(K, R) inline_rwlock_init(R)
+#endif
+
+// Initialize a read/write lock: zero all four counters, create both condition variables, and clear wait_users_go_to_zero.
+static inline __attribute__((__unused__)) void inline_rwlock_init(
+#if defined(TOKU_MYSQL_WITH_PFS)
+    const toku_instr_key &rwlock_instr_key,
+#endif
+    RWLOCK rwlock) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+    toku_pthread_rwlock_init(rwlock_instr_key, &rwlock->prwlock, nullptr);
+#endif
+    rwlock->reader = rwlock->want_read = 0;
+    rwlock->writer = rwlock->want_write = 0;
+    toku_cond_init(toku_uninstrumented, &rwlock->wait_read, nullptr);
+    toku_cond_init(toku_uninstrumented, &rwlock->wait_write, nullptr);
+    rwlock->wait_users_go_to_zero = NULL;
+}
+
+// Destroy a read/write lock.  The lock must be idle: no holders and no waiters (checked by the paranoid invariants).
+
+static inline __attribute__((__unused__)) void rwlock_destroy(RWLOCK rwlock) {
+    paranoid_invariant(rwlock->reader == 0);
+    paranoid_invariant(rwlock->want_read == 0);
+    paranoid_invariant(rwlock->writer == 0);
+    paranoid_invariant(rwlock->want_write == 0);
+    toku_cond_destroy(&rwlock->wait_read);
+    toku_cond_destroy(&rwlock->wait_write);
+#if defined(TOKU_MYSQL_WITH_PFS)
+    toku_pthread_rwlock_destroy(&rwlock->prwlock);
+#endif
+}
+
+// Obtain a read lock, blocking while a writer holds the lock or is waiting for it.
+// expects: mutex is locked (it is the mutex handed to toku_cond_wait while blocked)
+
+static inline void rwlock_read_lock(RWLOCK rwlock, toku_mutex_t *mutex) {
+#ifdef TOKU_MYSQL_WITH_PFS
+    /* Instrumentation start */
+    toku_rwlock_instrumentation rwlock_instr;
+    // TODO: pull location information up to caller
+    toku_instr_rwlock_rdlock_wait_start(
+        rwlock_instr, rwlock->prwlock, __FILE__, __LINE__);
+#endif
+
+    paranoid_invariant(!rwlock->wait_users_go_to_zero);
+    if (rwlock->writer || rwlock->want_write) {
+        // Count ourselves as a blocked reader so rwlock_write_unlock knows to broadcast.
+        rwlock->want_read++;
+        while (rwlock->writer || rwlock->want_write) {
+            toku_cond_wait(&rwlock->wait_read, mutex);
+        }
+        rwlock->want_read--;
+    }
+    rwlock->reader++;
+#ifdef TOKU_MYSQL_WITH_PFS
+    /* Instrumentation end: close the *read* wait opened above (was wrlock_wait_end). */
+    toku_instr_rwlock_rdlock_wait_end(rwlock_instr, 0);
+#endif
+}
+
+// Release a read lock; the last reader out signals a waiting writer, if any.
+// expects: mutex is locked
+static inline void rwlock_read_unlock(RWLOCK rwlock) {
+#ifdef TOKU_MYSQL_WITH_PFS
+    toku_instr_rwlock_unlock(rwlock->prwlock);
+#endif
+    paranoid_invariant(rwlock->reader > 0);
+    paranoid_invariant(rwlock->writer == 0);
+    const bool last_reader_left = (--rwlock->reader == 0);
+    if (last_reader_left && rwlock->want_write != 0) {
+        toku_cond_signal(&rwlock->wait_write);
+    }
+    // Wake rwlock_wait_for_users() once nobody holds or waits for the lock.
+    toku_cond_t *const drained = rwlock->wait_users_go_to_zero;
+    if (drained != NULL && rwlock_users(rwlock) == 0) {
+        toku_cond_signal(drained);
+    }
+}
+
+// Obtain the write lock, blocking while any reader or another writer holds it.
+// expects: mutex is locked
+
+static inline void rwlock_write_lock(RWLOCK rwlock, toku_mutex_t *mutex) {
+#ifdef TOKU_MYSQL_WITH_PFS
+    /* Instrumentation start */
+    toku_rwlock_instrumentation rwlock_instr;
+    toku_instr_rwlock_wrlock_wait_start(
+        rwlock_instr, rwlock->prwlock, __FILE__, __LINE__);
+#endif
+    paranoid_invariant(!rwlock->wait_users_go_to_zero);
+    if (rwlock->reader || rwlock->writer) {
+        rwlock->want_write++;  // while this is nonzero, rwlock_read_lock makes new readers wait
+        while (rwlock->reader || rwlock->writer) {
+            toku_cond_wait(&rwlock->wait_write, mutex);
+        }
+        rwlock->want_write--;
+    }
+    rwlock->writer++;
+#if defined(TOKU_MYSQL_WITH_PFS)
+    /* Instrumentation end */
+    toku_instr_rwlock_wrlock_wait_end(rwlock_instr, 0);
+#endif
+}
+
+// Release the write lock and wake whoever goes next: one waiting writer if
+// there is any, otherwise every waiting reader.  expects: mutex is locked
+static inline void rwlock_write_unlock(RWLOCK rwlock) {
+#if defined(TOKU_MYSQL_WITH_PFS)
+    toku_instr_rwlock_unlock(rwlock->prwlock);
+#endif
+    paranoid_invariant(rwlock->reader == 0);
+    paranoid_invariant(rwlock->writer == 1);
+    rwlock->writer--;
+    if (rwlock->want_write != 0) {
+        toku_cond_signal(&rwlock->wait_write);
+    } else if (rwlock->want_read != 0) {
+        toku_cond_broadcast(&rwlock->wait_read);
+    }
+    toku_cond_t *const drained = rwlock->wait_users_go_to_zero;
+    if (drained != NULL && rwlock_users(rwlock) == 0) {
+        toku_cond_signal(drained);
+    }
+}
+
+// returns: the number of readers that currently hold the lock
+
+static inline int rwlock_readers(RWLOCK rwlock) {
+    return rwlock->reader;
+}
+
+// returns: the number of readers who are blocked waiting for the lock (want_read)
+
+static inline int rwlock_blocked_readers(RWLOCK rwlock) {
+    return rwlock->want_read;
+}
+
+// returns: the number of writers who are blocked waiting for the lock (want_write)
+
+static inline int rwlock_blocked_writers(RWLOCK rwlock) {
+    return rwlock->want_write;
+}
+
+// returns: the number of writers that currently hold the lock
+
+static inline int rwlock_writers(RWLOCK rwlock) {
+    return rwlock->writer;
+}
+
+static inline bool rwlock_write_will_block(RWLOCK rwlock) {
+    return (rwlock->writer > 0 || rwlock->reader > 0);  // true when a writer or any reader holds the lock
+}
+
+static inline int rwlock_read_will_block(RWLOCK rwlock) {
+    return (rwlock->writer > 0 || rwlock->want_write > 0);  // nonzero when a writer holds the lock or is waiting for it
+}
+// Block until the lock has no holders and no waiters.  expects: mutex is locked
+static inline void rwlock_wait_for_users(RWLOCK rwlock, toku_mutex_t *mutex) {
+    paranoid_invariant(!rwlock->wait_users_go_to_zero);  // no other thread may already be waiting for the lock to drain
+    toku_cond_t cond;  // local condition variable; the unlock functions signal it through wait_users_go_to_zero
+    toku_cond_init(toku_uninstrumented, &cond, nullptr);
+    while (rwlock_users(rwlock) > 0) {
+        rwlock->wait_users_go_to_zero = &cond;
+        toku_cond_wait(&cond, mutex);
+    }
+    rwlock->wait_users_go_to_zero = NULL;  // clear the pointer before cond is destroyed
+    toku_cond_destroy(&cond);
+}
+
diff --git a/storage/tokudb/PerconaFT/util/scoped_malloc.cc b/storage/tokudb/PerconaFT/util/scoped_malloc.cc
new file mode 100644
index 00000000..6c4fb95a
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/scoped_malloc.cc
@@ -0,0 +1,227 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <portability/memory.h>
+
+#include <util/scoped_malloc.h>
+
+// The __thread storage class modifier isn't well supported on osx, but we
+// aren't worried about the performance on osx, so we provide a
+// pass-through implementation of scoped mallocs.
+#ifdef __APPLE__
+
+namespace toku {
+    // Pass-through variant: every buffer comes straight from toku_xmalloc and goes back through toku_free.
+    scoped_malloc::scoped_malloc(const size_t size)
+        : m_size(size),
+          m_local(false),
+          m_buf(toku_xmalloc(size)) {}
+
+    scoped_malloc::~scoped_malloc() {
+        toku_free(m_buf);
+    }
+
+} // namespace toku
+
+void toku_scoped_malloc_init(void) {}         // no-op: the pass-through variant keeps no global state
+void toku_scoped_malloc_destroy(void) {}      // no-op
+void toku_scoped_malloc_destroy_set(void) {}  // no-op
+void toku_scoped_malloc_destroy_key(void) {}  // no-op
+
+#else // __APPLE__
+
+#include <set>
+#include <pthread.h>
+
+#include <portability/toku_pthread.h>
+
+namespace toku {
+
+    // see pthread_key handling at the bottom
+    //
+    // when we use gcc 4.8, we can use the 'thread_local' keyword and proper c++
+    // constructors/destructors instead of this pthread / global set wizardry.
+    static pthread_key_t tl_stack_destroy_pthread_key;
+    class tl_stack;
+    std::set<tl_stack *> *global_stack_set;
+    toku_mutex_t global_stack_set_mutex = TOKU_MUTEX_INITIALIZER;
+
+    class tl_stack {
+        // Capacity of each thread's buffer: 1MB.
+        static const size_t STACK_SIZE = 1 * 1024 * 1024;
+
+    public:
+        // Allocate the buffer and store this stack under the destroy key for the calling thread.
+        void init() {
+            m_stack = reinterpret_cast<char *>(toku_xmalloc(STACK_SIZE));
+            m_current_offset = 0;
+            int r = pthread_setspecific(tl_stack_destroy_pthread_key, this);
+            invariant_zero(r);
+        }
+
+        // Free the buffer if there is one; calling it again is a no-op.
+        void destroy() {
+#if defined(TOKU_SCOPED_MALLOC_DEBUG) && TOKU_SCOPED_MALLOC_DEBUG
+            printf("%s %p %p\n", __FUNCTION__, this, m_stack);
+#endif
+            if (m_stack != NULL) {
+                toku_free(m_stack);
+                m_stack = NULL;
+            }
+        }
+
+        // initialize a tl_stack and insert it into the global set
+        static void init_and_register(tl_stack *st) {
+            st->init();
+            invariant_notnull(global_stack_set);
+            toku_mutex_lock(&global_stack_set_mutex);
+            std::pair<std::set<tl_stack *>::iterator, bool> p = global_stack_set->insert(st);
+            invariant(p.second);  // a stack must not already be in the set
+            toku_mutex_unlock(&global_stack_set_mutex);
+        }
+
+        // destruct a tl_stack and remove it from the global set
+        // passed in as void * to match the generic pthread destructor API
+        static void destroy_and_deregister(void *key) {
+            invariant_notnull(key);
+            tl_stack *st = reinterpret_cast<tl_stack *>(key);
+            size_t n = 0;
+            toku_mutex_lock(&global_stack_set_mutex);
+            if (global_stack_set) {
+                n = global_stack_set->erase(st);
+            }
+            toku_mutex_unlock(&global_stack_set_mutex);
+
+            if (n == 1) {
+                st->destroy(); // destroy the stack if this function erased it from the set.  otherwise, somebody else destroyed it.
+            }
+        }
+
+        // Allocate 'size' bytes and return a pointer to the first byte
+        void *alloc(const size_t size) {
+            if (m_stack == NULL) {
+                init_and_register(this);
+            }
+            invariant(size <= get_free_space());  // subtractive form: a huge 'size' cannot wrap around and slip past the check
+            void *mem = &m_stack[m_current_offset];
+            m_current_offset += size;
+            return mem;
+        }
+
+        // Give back a previously allocated region of 'size' bytes.
+        void dealloc(const size_t size) {
+            invariant(m_current_offset >= size);
+            m_current_offset -= size;
+        }
+
+        // Get the current size of free-space in bytes.
+        size_t get_free_space() const {
+            invariant(m_current_offset <= STACK_SIZE);
+            return STACK_SIZE - m_current_offset;
+        }
+
+    private:
+        // Offset of the free region in the stack
+        size_t m_current_offset;
+        char *m_stack;
+    };
+
+    // Each thread has its own local stack.
+    static __thread tl_stack local_stack;
+
+    // The buffer is carved from the calling thread's local stack when it has room; otherwise it comes from toku_xmalloc.
+    scoped_malloc::scoped_malloc(const size_t size)
+        : m_size(size),
+          m_local(size <= local_stack.get_free_space()),
+          m_buf(m_local ? local_stack.alloc(size) : toku_xmalloc(size)) {
+    }
+
+    scoped_malloc::~scoped_malloc() {
+        if (!m_local) {
+            toku_free(m_buf);  // the buffer came from toku_xmalloc
+            return;
+        }
+        local_stack.dealloc(m_size);  // give the bytes back to the thread-local stack
+    }
+
+} // namespace toku
+
+// pthread key handling:
+// - there is a process-wide pthread key that is associated with the destructor for a tl_stack
+// - on process construction, we initialize the key; on destruction, we clean it up.
+// - when a thread first uses its tl_stack, it calls pthread_setspecific(tl_stack_destroy_pthread_key, <its tl_stack>),
+//   which makes the key's destructor, tl_stack::destroy_and_deregister, apply to that thread
+// - when a thread terminates, pthread invokes that destructor with the thread's tl_stack as its argument.
+
+void toku_scoped_malloc_init(void) {
+    // Create the set that tracks registered thread-local stacks; initializing twice trips the invariant.
+    toku_mutex_lock(&toku::global_stack_set_mutex);
+    invariant_null(toku::global_stack_set);
+    toku::global_stack_set = new std::set<toku::tl_stack *>();
+    toku_mutex_unlock(&toku::global_stack_set_mutex);
+    // Create the key whose destructor is tl_stack::destroy_and_deregister.
+    const int r = pthread_key_create(&toku::tl_stack_destroy_pthread_key, toku::tl_stack::destroy_and_deregister);
+    invariant_zero(r);
+}
+
+void toku_scoped_malloc_destroy(void) {
+    toku_scoped_malloc_destroy_key();  // delete the pthread key first ...
+    toku_scoped_malloc_destroy_set();  // ... then free every stack still registered, and the set itself
+}
+
+void toku_scoped_malloc_destroy_set(void) {
+    toku_mutex_lock(&toku::global_stack_set_mutex);
+    invariant_notnull(toku::global_stack_set);
+    // Some threads may not have fully shut down yet, so their pthread key
+    // destructor has not had a chance to run; release the stacks they
+    // registered here instead.
+    for (toku::tl_stack *leftover : *toku::global_stack_set) {
+        leftover->destroy();
+    }
+    // Drop the set itself; destroy_and_deregister() tolerates a null set.
+    delete toku::global_stack_set;
+    toku::global_stack_set = nullptr;
+    toku_mutex_unlock(&toku::global_stack_set_mutex);
+}
+
+void toku_scoped_malloc_destroy_key(void) {
+    int r = pthread_key_delete(toku::tl_stack_destroy_pthread_key);  // NOTE(review): per POSIX, deleting a key does not run its destructor
+    invariant_zero(r);
+}
+
+#endif // !__APPLE__
diff --git a/storage/tokudb/PerconaFT/util/scoped_malloc.h b/storage/tokudb/PerconaFT/util/scoped_malloc.h
new file mode 100644
index 00000000..b95b687a
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/scoped_malloc.h
@@ -0,0 +1,103 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+
+namespace toku {
+
+    // A buffer whose lifetime is tied to the enclosing scope: it is obtained
+    // in the constructor and given back in the destructor.
+    class scoped_malloc {
+    public:
+        // Memory is allocated from thread-local storage if available, otherwise from malloc(3).
+        scoped_malloc(const size_t size);
+
+        ~scoped_malloc();
+
+        // Returns the start of the buffer.
+        void *get() const {
+            return m_buf;
+        }
+
+    private:
+        // Not default-constructible: callers must supply a size.
+        scoped_malloc();
+
+        // Non-copyable.  A copy would share m_buf with the original and the
+        // buffer would then be released once per object.
+        scoped_malloc(const scoped_malloc &) = delete;
+        scoped_malloc &operator=(const scoped_malloc &) = delete;
+
+        const size_t m_size;
+        const bool m_local;
+        void *const m_buf;
+    };
+
+    class scoped_calloc : public scoped_malloc {
+    public:
+        // Same as scoped_malloc, except that all size bytes of the buffer are
+        // set to zero before the constructor returns, as in calloc(3).
+        scoped_calloc(const size_t size) :
+            scoped_malloc(size) {
+            void *const buf = scoped_malloc::get();
+            memset(buf, 0, size);
+        }
+    };
+
+    class scoped_malloc_aligned : public scoped_malloc {
+    public:
+        // Requests size + alignment bytes from scoped_malloc and picks, inside
+        // that buffer, an address that is a multiple of alignment.  The chosen
+        // address is always past the start of the buffer: a start that is
+        // already aligned is still advanced by one full alignment.
+        // Asserts size >= alignment and alignment > 0.
+        scoped_malloc_aligned(const size_t size, const size_t alignment) :
+            scoped_malloc(size + alignment) {
+            invariant(size >= alignment);
+            invariant(alignment > 0);
+            const uintptr_t addr = reinterpret_cast<uintptr_t>(scoped_malloc::get());
+            // distance of addr past the previous multiple of alignment
+            const uintptr_t past_boundary = addr % alignment;
+            const uintptr_t aligned_addr = addr + (alignment - past_boundary);
+            invariant(aligned_addr < addr + size + alignment);
+            m_aligned_buf = reinterpret_cast<char *>(aligned_addr);
+        }
+
+        // Returns the aligned address.  This hides, rather than overrides,
+        // scoped_malloc::get(), which is not virtual.
+        void *get() const {
+            return m_aligned_buf;
+        }
+
+    private:
+        void *m_aligned_buf;
+    };
+
+} // namespace toku
+
+// Process-wide setup: creates the global tl_stack set and the pthread key
+// used to clean up per-thread stacks.
+void toku_scoped_malloc_init(void);
+
+// Process-wide teardown: toku_scoped_malloc_destroy_key() followed by
+// toku_scoped_malloc_destroy_set().
+void toku_scoped_malloc_destroy(void);
+
+// Destroys every tl_stack still registered in the global set, then frees the set.
+void toku_scoped_malloc_destroy_set(void);
+
+// Deletes the pthread key created by toku_scoped_malloc_init().
+void toku_scoped_malloc_destroy_key(void);
+
diff --git a/storage/tokudb/PerconaFT/util/sort.h b/storage/tokudb/PerconaFT/util/sort.h
new file mode 100644
index 00000000..0f0bb7ee
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/sort.h
@@ -0,0 +1,208 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+#include <memory.h>
+
+namespace toku {
+
+    template<typename sortdata_t, typename sortextra_t, int (*cmp)(sortextra_t &, const sortdata_t &, const sortdata_t &)>
+    struct sort {
+
+        static const int single_threaded_threshold = 10000;
+
+        /**
+         * Effect: Sort the n elements of the array a in place, ordering them
+         *   with the template parameter cmp and passing extra through as its
+         *   context.  Inputs of at least single_threaded_threshold elements
+         *   get a temporary second buffer of n elements; smaller inputs are
+         *   sorted without one.
+         * Returns: always 0.
+         */
+        static int
+        mergesort_r(sortdata_t *a, const int n, sortextra_t &extra)
+        {
+            const bool needs_scratch = (n >= single_threaded_threshold);
+            sortdata_t *buffers[2] = { a, nullptr };
+            if (needs_scratch) {
+                XMALLOC_N(n, buffers[1]);
+            }
+            // the sorted data may end up in either buffer; copy it home if needed
+            if (mergesort_internal(buffers, 0, n, extra) == 1) {
+                memcpy(a, buffers[1], n * (sizeof a[0]));
+            }
+            if (needs_scratch) {
+                toku_free(buffers[1]);
+            }
+            return 0;
+        }
+
+    private:
+
+        // Sorts the first n elements of as[which].  The return value dest
+        // (either which or 1-which) names the buffer, as[dest], that holds
+        // the sorted data afterwards.
+        static int
+        mergesort_internal(sortdata_t *as[2], const int which, const int n, sortextra_t &extra)
+        {
+            if (n <= 1) {
+                return which;
+            }
+            if (n < single_threaded_threshold) {
+                // small input: sort in place within as[which]
+                quicksort_r(as[which], n, extra);
+                return which;
+            }
+            const int left_n = n / 2;
+            const int right_n = n - left_n;
+            sortdata_t *right_as[2] = { as[0] + left_n, as[1] + left_n };
+            const int left_at = mergesort_internal(as, which, left_n, extra);
+            const int right_at = mergesort_internal(right_as, which, right_n, extra);
+            if (left_at != right_at) {
+                // bring the sorted left half into the buffer holding the right half
+                memcpy(as[right_at], as[left_at], left_n * (sizeof as[right_at][0]));
+            }
+            // both sorted halves now sit in as[right_at]; merge them into the other buffer
+            const int dest = 1 - right_at;
+            sortdata_t *const src = as[right_at];
+            merge(as[dest], src, left_n, src + left_n, right_n, extra);
+            return dest;
+        }
+
+        // Sequential merge of the runs a[0..an) and b[0..bn) (each assumed to
+        // be sorted by the caller) into dest.  An element of a is emitted only
+        // when it compares strictly less than the current element of b;
+        // otherwise the element of b goes first.
+        static void
+        merge_c(sortdata_t *dest, const sortdata_t *a, const int an, const sortdata_t *b, const int bn, sortextra_t &extra)
+        {
+            int ai = 0;
+            int bi = 0;
+            int out = 0;
+            while (ai < an && bi < bn) {
+                if (cmp(extra, a[ai], b[bi]) < 0) {
+                    dest[out] = a[ai];
+                    ++ai;
+                } else {
+                    dest[out] = b[bi];
+                    ++bi;
+                }
+                ++out;
+            }
+            // at most one run still has elements left; append them in bulk
+            if (ai < an) {
+                memcpy(&dest[out], &a[ai], (an - ai) * (sizeof dest[0]));
+            } else if (bi < bn) {
+                memcpy(&dest[out], &b[bi], (bn - bi) * (sizeof dest[0]));
+            }
+        }
+
+        // Bisects a[0..n) for key and returns a position offset by abefore.
+        // If a probed element compares equal to key, its position is returned.
+        // Otherwise the search narrows until one element is left and returns
+        // that element's position when key compares less than it, or the
+        // position after it when key compares greater.  An empty range yields
+        // abefore.
+        static int
+        binsearch(const sortdata_t &key, const sortdata_t *a, const int n, const int abefore, sortextra_t &extra)
+        {
+            const sortdata_t *base = a;
+            int count = n;
+            int before = abefore;
+            while (count > 0) {
+                const int mid = count / 2;
+                const int c = cmp(extra, key, base[mid]);
+                if (c == 0) {
+                    return before + mid;
+                }
+                if (count == 1) {
+                    return (c < 0) ? before : before + 1;
+                }
+                if (c < 0) {
+                    // continue in base[0..mid)
+                    count = mid;
+                } else {
+                    // continue in base[mid..count); the probed element stays in range
+                    base += mid;
+                    before += mid;
+                    count -= mid;
+                }
+            }
+            return before;
+        }
+
+        // Merges the runs a_[0..an_) and b_[0..bn_) into dest.  Inputs with
+        // fewer than single_threaded_threshold elements in total go straight
+        // to merge_c.  Larger inputs are split: the longer run is cut at its
+        // midpoint, binsearch locates where that midpoint element falls in the
+        // other run, and the two resulting pairs of sub-runs are merged
+        // recursively into adjacent parts of dest.
+        static void
+        merge(sortdata_t *dest, const sortdata_t *a_, const int an_, const sortdata_t *b_, const int bn_, sortextra_t &extra)
+        {
+            if (an_ + bn_ < single_threaded_threshold) {
+                merge_c(dest, a_, an_, b_, bn_, extra);
+                return;
+            }
+
+            // arrange for (a, an) to be the run that is at least as long as the other
+            const sortdata_t *a = a_;
+            const sortdata_t *b = b_;
+            int an = an_;
+            int bn = bn_;
+            if (an_ < bn_) {
+                a = b_;
+                b = a_;
+                an = bn_;
+                bn = an_;
+            }
+
+            const int a_split = an / 2;
+            const int b_split = binsearch(a[a_split], b, bn, 0, extra);
+            merge(dest, a, a_split, b, b_split, extra);
+            merge(dest + a_split + b_split, a + a_split, an - a_split, b + b_split, bn - b_split, extra);
+        }
+
+        // Recursive in-place quicksort of a[0..n), ordered by cmp with extra
+        // as context.  Arrays of fewer than two elements are left untouched.
+        static void
+        quicksort_r(sortdata_t *a, const int n, sortextra_t &extra)
+        {
+            if (n > 1) {
+                const int lo = 0;
+                int pivot = n / 2;
+                const int hi = n - 1;
+                // Median of three: order the first, middle and last elements so
+                // that a[lo] <= a[pivot] <= a[hi] according to cmp.
+                if (cmp(extra, a[lo], a[pivot]) > 0) {
+                    const sortdata_t tmp = a[lo]; a[lo] = a[pivot]; a[pivot] = tmp;
+                }
+                if (cmp(extra, a[pivot], a[hi]) > 0) {
+                    const sortdata_t tmp = a[pivot]; a[pivot] = a[hi]; a[hi] = tmp;
+                    // the element that came from a[hi] may be smaller than a[lo]
+                    if (cmp(extra, a[lo], a[pivot]) > 0) {
+                        const sortdata_t tmp2 = a[lo]; a[lo] = a[pivot]; a[pivot] = tmp2;
+                    }
+                }
+                // Partition the elements strictly between lo and hi around
+                // a[pivot]; pivot is an index and follows its element when swapped.
+                int li = lo + 1, ri = hi - 1;
+                while (li <= ri) {
+                    // advance li past elements less than the pivot element
+                    while (cmp(extra, a[li], a[pivot]) < 0) {
+                        li++;
+                    }
+                    // retreat ri past elements greater than the pivot element
+                    while (cmp(extra, a[pivot], a[ri]) < 0) {
+                        ri--;
+                    }
+                    if (li < ri) {
+                        sortdata_t tmp = a[li]; a[li] = a[ri]; a[ri] = tmp;
+                        // fix up pivot if we moved it
+                        if (pivot == li) { pivot = ri; }
+                        else if (pivot == ri) { pivot = li; }
+                        li++;
+                        ri--;
+                    } else if (li == ri) {
+                        // both cursors stopped on the same element: step past it
+                        li++;
+                        ri--;
+                    }
+                }
+
+                // sort a[lo..ri] and a[li..hi]
+                quicksort_r(&a[lo], ri + 1, extra);
+                quicksort_r(&a[li], hi - li + 1, extra);
+            }
+        }
+    };
+
+};
diff --git a/storage/tokudb/PerconaFT/util/status.h b/storage/tokudb/PerconaFT/util/status.h
new file mode 100644
index 00000000..2d03ef1e
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/status.h
@@ -0,0 +1,75 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <util/partitioned_counter.h>
+#include <util/constexpr.h>
+
+// TOKUFT_STATUS_INIT(array, k, c, t, l, inc): fill in row k of array.status.
+//   k   - row index; also stringified to become the row's keyname
+//   c   - column name token, stringified to become the row's columnname
+//   t   - value type; PARCOUNT rows also get a newly created partitioned counter
+//   l   - legend
+//   inc - include flags, stored as toku_engine_status_include_type; must be nonzero
+// Compile-time checks reject a column name spelled NULL or 0, a column name of
+// nullptr unless inc is exactly TOKU_ENGINE_STATUS, and a column name for
+// which static_strncasecmp against "TOKU" over strlen("TOKU") characters
+// returns 0.
+// The t and l arguments are parenthesized where they are used as values so
+// that expression arguments expand safely.
+#define TOKUFT_STATUS_INIT(array,k,c,t,l,inc) do {   \
+    array.status[k].keyname = #k;                    \
+    array.status[k].columnname = #c;                 \
+    array.status[k].type    = (t);                   \
+    array.status[k].legend  = (l);                   \
+    static_assert((inc) != 0, "Var must be included in at least one place"); \
+    constexpr_static_assert(strcmp(#c, "NULL") && strcmp(#c, "0"),           \
+            "Use nullptr for no column name instead of NULL, 0, etc...");    \
+    constexpr_static_assert((inc) == TOKU_ENGINE_STATUS                      \
+            || strcmp(#c, "nullptr"), "Missing column name.");               \
+    constexpr_static_assert(static_strncasecmp(#c, "TOKU", strlen("TOKU")),  \
+                  "Do not start column names with toku."); \
+    array.status[k].include = static_cast<toku_engine_status_include_type>(inc);  \
+    if ((t) == PARCOUNT) {                                             \
+        array.status[k].value.parcount = create_partitioned_counter(); \
+    }                                                                  \
+} while (0)
+
diff --git a/storage/tokudb/PerconaFT/util/tests/CMakeLists.txt b/storage/tokudb/PerconaFT/util/tests/CMakeLists.txt
new file mode 100644
index 00000000..8d53dd89
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/CMakeLists.txt
@@ -0,0 +1,24 @@
+if(BUILD_TESTING)
+  # Every *.cc file in this directory is one standalone test program, named
+  # after the file with its extension removed.  The glob is evaluated at
+  # configure time only: re-run CMake after adding or removing a test source.
+  # Start from an empty list so that a `tests` variable inherited from an
+  # enclosing directory scope cannot leak into this one.
+  set(tests "")
+  file(GLOB srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" *.cc)
+  foreach(src ${srcs})
+    get_filename_component(base ${src} NAME_WE)
+    list(APPEND tests ${base})
+  endforeach()
+
+  foreach(test ${tests})
+    add_executable(${test} ${test}.cc)
+    target_link_libraries(${test} util ${LIBTOKUPORTABILITY})
+  endforeach()
+
+  # The partitioned counter tests are additionally run through helgrind.
+  add_helgrind_test(util helgrind_test_partitioned_counter $<TARGET_FILE:test_partitioned_counter>)
+  add_helgrind_test(util helgrind_test_partitioned_counter_5833 $<TARGET_FILE:test_partitioned_counter_5833>)
+
+  # Register each test program with CTest as util/<name>.
+  foreach(test ${tests})
+    add_test(util/${test} ${test})
+  endforeach()
+
+  # Tests that need a longer timeout (in seconds) than the default.
+  set(long_tests
+    util/helgrind_test_partitioned_counter
+    )
+  set_tests_properties(${long_tests} PROPERTIES TIMEOUT 3600)
+endif()
diff --git a/storage/tokudb/PerconaFT/util/tests/marked-omt-test.cc b/storage/tokudb/PerconaFT/util/tests/marked-omt-test.cc
new file mode 100644
index 00000000..7e60c711
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/marked-omt-test.cc
@@ -0,0 +1,466 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "test.h"
+
+#include <toku_portability.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <db.h>
+#include <string.h>
+
+#include <memory.h>
+
+#include <portability/toku_atomic.h>
+#include <portability/toku_pthread.h>
+#include <portability/toku_random.h>
+
+#include <util/omt.h>
+#include <util/rwlock.h>
+
+namespace toku {
+
+namespace test {
+
+// fudge() maps an index to the value the tests store at that index
+// (index + 300); defudge() is the inverse mapping.
+static inline uint32_t fudge(const uint32_t x) {
+    return x + 300;
+}
+static inline uint32_t defudge(const uint32_t fx) {
+    return fx - 300;
+}
+
+// Iteration callback: asserts that the value at position idx is fudge(idx).
+// The extra argument is not used.  Always returns 0.
+int test_iterator(const uint32_t &v, const uint32_t idx, bool *const UU(unused));
+int test_iterator(const uint32_t &v, const uint32_t idx, bool *const UU(unused)) {
+    invariant(defudge(v) == idx);
+    return 0;
+}
+
+// Callback for iterating over marked elements before any deletion: asserts
+// that the value matches its index and that the index falls in the first half
+// of its group of ten, then records the visit in called[idx].  Always returns 0.
+int check_iterator_before(const uint32_t &v, const uint32_t idx, bool *const called);
+int check_iterator_before(const uint32_t &v, const uint32_t idx, bool *const called) {
+    invariant(defudge(v) == idx);
+    invariant(idx % 10 < 5);
+    called[idx] = true;
+    return 0;
+}
+
+// Callback for iterating after the marked elements were deleted: asserts that
+// the surviving value decodes to a number in the second half of its group of
+// ten and records the visit under that decoded number (the current position
+// idx is not used).  Always returns 0.
+int check_iterator_after(const uint32_t &v, const uint32_t UU(idx), bool *const called);
+int check_iterator_after(const uint32_t &v, const uint32_t UU(idx), bool *const called) {
+    invariant(defudge(v) % 10 >= 5);
+    called[defudge(v)] = true;
+    return 0;
+}
+
+// Iteration callback that must never be invoked: it aborts the process.
+// Used where the iteration is expected to visit no elements.
+int die(const uint32_t &UU(v), const uint32_t UU(idx), void *const UU(unused));
+int die(const uint32_t &UU(v), const uint32_t UU(idx), void *const UU(unused)) {
+    abort();
+    return 0; // hahaha (not reached: abort() is called first)
+}
+
+// Single-threaded test of marking on an omt of nelts elements (nelts must be
+// a multiple of 10): marks the first five elements of every group of ten,
+// checks that exactly those are reported as marked, deletes all marked
+// elements and checks that exactly the other half survives with no marks left.
+static void run_test(uint32_t nelts) {
+    assert(nelts % 10 == 0);  // run_test depends on nelts being a multiple of 10
+
+    omt<uint32_t, uint32_t, true> omt;
+    omt.create();
+    omt.verify_marks_consistent();
+    // position i holds fudge(i)
+    for (uint32_t i = 0; i < nelts; ++i) {
+        omt.insert_at(fudge(i), i);
+    }
+    omt.verify_marks_consistent();
+
+    int r;
+    // mark positions [10*i, 10*i + 5) in every group of ten
+    for (uint32_t i = 0; i < nelts / 10; ++i) {
+        r = omt.iterate_and_mark_range<bool, test_iterator>(i * 10, i * 10 + 5, nullptr);
+        invariant_zero(r);
+        omt.verify_marks_consistent();
+    }
+
+    // iterating over marked elements must visit exactly the marked positions
+    bool called[nelts];
+    ZERO_ARRAY(called);
+    r = omt.iterate_over_marked<bool, check_iterator_before>(called);
+    invariant_zero(r);
+    for (uint32_t i = 0; i < nelts; ++i) {
+        if (i % 10 < 5) {
+            invariant(called[i]);
+        } else {
+            invariant(!called[i]);
+        }
+    }
+    omt.verify_marks_consistent();
+
+    invariant(omt.size() == nelts);
+
+    omt.delete_all_marked();
+    omt.verify_marks_consistent();
+
+    // half of the elements were marked, so half remain
+    invariant(omt.size() * 2 == nelts);
+
+    // nothing is marked any more, so die must never be called
+    r = omt.iterate_over_marked<void, die>(nullptr);
+    invariant_zero(r);
+
+    // the survivors must be exactly the elements that were never marked
+    ZERO_ARRAY(called);
+    r = omt.iterate<bool, check_iterator_after>(called);
+    invariant_zero(r);
+    omt.verify_marks_consistent();
+
+    for (uint32_t i = 0; i < nelts; ++i) {
+        if (i % 10 < 5) {
+            invariant(!called[i]);
+        } else {
+            invariant(called[i]);
+        }
+    }
+
+    omt.destroy();
+}
+
+typedef omt<uint32_t, uint32_t, true> stress_omt;
+
+// Three-way comparison of v against target: negative when v is smaller,
+// positive when v is larger, zero when they are equal.
+int int_heaviside(const uint32_t &v, const uint32_t &target);
+int int_heaviside(const uint32_t &v, const uint32_t &target) {
+    if (v < target) {
+        return -1;
+    }
+    if (v > target) {
+        return 1;
+    }
+    return 0;
+}
+
+// State shared by all threads of the stress test.
+struct stress_shared {
+    stress_omt *omt;          // the omt under test
+    volatile bool running;    // marker threads loop while this is true; the deleter thread clears it
+    struct st_rwlock lock;    // markers take it for reading, the deleter for writing
+    toku_mutex_t mutex;       // held around every operation on lock
+    int num_marker_threads;   // number of marker threads
+};
+
+// Per-marker-thread context for the stress test.
+struct reader_extra {
+    int tid;                        // index of the marker thread
+    stress_shared *shared;          // state shared with all other threads
+    uint64_t iterations;            // ranges marked so far; incremented by the marker thread
+    uint64_t last_iteration;        // iterations the deleter has already replayed
+    char buf_read[8];               // state buffer backing rand_read
+    char buf_write[8];              // state buffer backing rand_write
+    struct random_data rand_read;   // generator the marker thread draws its ranges from
+    struct random_data rand_write;  // generator the deleter uses to replay those ranges
+};
+
+// Draws a random range [*begin, *limit) of positions in shared.omt from rng.
+// The maximum length is size / 1000 / num_marker_threads + 1, clamped to
+// [5, 1000] and then cut down so that the range cannot extend past the end
+// of the omt.
+static void generate_range(struct random_data *rng, const struct stress_shared &shared, uint32_t *begin, uint32_t *limit) {
+    const uint32_t nelts = shared.omt->size();
+
+    double scaled = nelts;
+    scaled /= 1000;
+    scaled /= shared.num_marker_threads;
+    scaled += 1;
+
+    uint32_t max_len = static_cast<uint32_t>(scaled);
+    if (max_len < 5) {
+        max_len = 5;
+    }
+    if (max_len > 1000) {
+        max_len = 1000;
+    }
+
+    const uint32_t start = rand_choices(rng, nelts - 1);
+    *begin = start;
+    if (start + max_len > nelts) {
+        max_len = nelts - start;
+    }
+    *limit = start + rand_choices(rng, max_len);
+}
+
+// A range of omt positions; mark_read_iterator asserts begin <= idx < limit.
+struct pair {
+    uint32_t begin;
+    uint32_t limit;
+};
+
+// Callback used while marking a range: asserts that the value at position idx
+// is fudge(idx) and that idx lies inside [pair->begin, pair->limit).
+// Always returns 0.
+// v is read by the first invariant, so it no longer carries the UU (unused)
+// annotation.
+int mark_read_iterator(const uint32_t &v, const uint32_t idx, struct pair * const pair);
+int mark_read_iterator(const uint32_t &v, const uint32_t idx, struct pair * const pair) {
+    invariant(defudge(v) == idx);
+    invariant(idx >= pair->begin);
+    invariant(idx < pair->limit);
+    return 0;
+}
+
+// Marker thread body.  extrav points at this thread's reader_extra.  While
+// shared.running is true, the thread repeatedly takes the read lock, marks a
+// random range of the shared omt, counts the iteration, releases the lock and
+// sleeps briefly.  Returns nullptr.
+static void *stress_mark_worker(void *extrav) {
+    struct reader_extra *CAST_FROM_VOIDP(extra, extrav);
+    struct stress_shared &shared = *extra->shared;
+    toku_mutex_t &mutex = shared.mutex;
+
+    while (shared.running) {
+        // the rwlock is only operated on while holding the mutex
+        toku_mutex_lock(&mutex);
+        rwlock_read_lock(&shared.lock, &mutex);
+        toku_mutex_unlock(&mutex);
+
+        // ranges come from rand_read; the deleter replays them from rand_write
+        struct pair range;
+        generate_range(&extra->rand_read, shared, &range.begin, &range.limit);
+
+        shared.omt->iterate_and_mark_range<pair, mark_read_iterator>(range.begin, range.limit, &range);
+
+        ++extra->iterations;
+
+        toku_mutex_lock(&mutex);
+        rwlock_read_unlock(&shared.lock);
+        toku_mutex_unlock(&mutex);
+
+        usleep(1);
+    }
+
+    return nullptr;
+}
+
+// Functor that appends every value it is called with to an internal array
+// allocated for size elements, and can later feed the collected values, in
+// insertion order, to another callable.  No bounds checking is done on append.
+template<typename T>
+class array_ftor {
+public:
+    array_ftor(int size) : m_count(0) {
+        XMALLOC_N(size, m_array);
+    }
+    ~array_ftor() {
+        toku_free(m_array);
+    }
+    // append x
+    void operator() (const T &x) {
+        m_array[m_count] = x;
+        ++m_count;
+    }
+    // call cb once for each collected value, oldest first
+    template<class callback_t>
+    void iterate(callback_t &cb) const {
+        T *const end = m_array + m_count;
+        for (T *p = m_array; p != end; ++p) {
+            cb(*p);
+        }
+    }
+private:
+    int m_count;
+    T *m_array;
+};
+
+// Iteration callback that hands each visited value to the array_ftor passed
+// as extra, which appends it to its array.  Always returns 0.
+int use_array_ftor(const uint32_t &v, const uint32_t UU(idx), array_ftor<uint32_t> *const fp);
+int use_array_ftor(const uint32_t &v, const uint32_t UU(idx), array_ftor<uint32_t> *const fp) {
+    array_ftor<uint32_t> &f = *fp;
+    f(v);
+    return 0;
+}
+
+// Functor that inserts each value it is called with into the given omt,
+// using int_heaviside with the value itself as the search key.
+class inserter {
+    stress_omt *m_omt;
+public:
+    inserter(stress_omt *omt) : m_omt(omt) {}
+    void operator() (const uint32_t &x) {
+        m_omt->insert<uint32_t, int_heaviside>(x, x, nullptr);
+    }
+};
+
+/*
+ * split the range evenly/not evenly between marker threads
+ * context tells it the range
+ * context also holds iteration number
+ *
+ * N threads
+ * N 'contexts' holds iteration number, seed
+ *
+ * create rng based on seed
+ * loop:
+ *   generate random range.  Mark that range, increment iteration number
+ *
+ *
+ * 
+ *
+ * for each context
+     * create rng based on context->last_seed
+     *   loop (iteration number times)
+     *     mark (in array) random range
+     * context->last_seed := context->seed
+ * check the array and the omt
+ *
+ */
+
+// Replays, on a plain bool array, the ranges one marker thread has marked
+// since the previous replay: for every iteration not yet replayed, draws the
+// next range from the reader's rand_write generator and sets
+// should_be_marked for each position in it.  Advances reader->last_iteration
+// up to reader->iterations.
+static void simulate_reader_marks_on_array(struct reader_extra *const reader, const struct stress_shared &shared, bool *const should_be_marked) {
+    if (verbose) {
+        fprintf(stderr, "thread %d ran %" PRIu64 " iterations\n", reader->tid, reader->iterations - reader->last_iteration);
+    }
+    while (reader->last_iteration < reader->iterations) {
+        uint32_t begin;
+        uint32_t limit;
+        generate_range(&reader->rand_write, shared, &begin, &limit);
+
+        for (uint32_t pos = begin; pos < limit; ++pos) {
+            should_be_marked[pos] = true;
+        }
+        ++reader->last_iteration;
+    }
+}
+
+// Callback for iterating over marked elements: asserts that the value matches
+// its position and records the position in is_marked.  Always returns 0.
+int copy_marks(const uint32_t &v, const uint32_t idx, bool * const is_marked);
+int copy_marks(const uint32_t &v, const uint32_t idx, bool * const is_marked) {
+    invariant(defudge(v) == idx);
+    is_marked[idx] = true;
+    return 0;
+}
+
+// Returns how many of the first n entries of bools are true.
+static inline uint32_t count_true(const bool *const bools, uint32_t n) {
+    uint32_t total = 0;
+    const bool *const end = bools + n;
+    for (const bool *p = bools; p != end; ++p) {
+        total += *p ? 1 : 0;
+    }
+    return total;
+}
+
+// One verification round of the stress test (the caller, stress_delete_worker,
+// holds the write lock).  Replays every marker thread's ranges on a bool
+// array, checks that the omt reports exactly the same positions as marked,
+// then deletes all marked elements and re-inserts them.
+static void stress_deleter(struct reader_extra *const readers, int num_marker_threads, stress_omt *omt) {
+    // Verify (iterate_over_marked) agrees exactly with iterate_and_mark_range (multithreaded)
+    stress_shared &shared = *readers[0].shared;
+    bool should_be_marked[omt->size()];
+    ZERO_ARRAY(should_be_marked);
+
+    // expected marks, reconstructed from each reader's replay generator
+    for (int i = 0; i < num_marker_threads; i++) {
+        simulate_reader_marks_on_array(&readers[i], shared, should_be_marked);
+    }
+
+    // actual marks, as reported by the omt
+    bool is_marked_according_to_iterate[omt->size()];
+    ZERO_ARRAY(is_marked_according_to_iterate);
+
+    omt->verify_marks_consistent();
+    omt->iterate_over_marked<bool, copy_marks>(&is_marked_according_to_iterate[0]);
+    omt->verify_marks_consistent();
+
+    invariant(!memcmp(should_be_marked, is_marked_according_to_iterate, sizeof(should_be_marked)));
+
+    if (verbose) {
+        double frac_marked = count_true(should_be_marked, omt->size());
+        frac_marked /= omt->size();
+
+        fprintf(stderr, "Marked: %0.4f\n", frac_marked);
+        omt->verify_marks_consistent();
+    }
+
+    // save the marked values, delete them, check that no marks remain,
+    // then put the saved values back
+    array_ftor<uint32_t> aftor(omt->size());
+    omt->iterate_over_marked<array_ftor<uint32_t>, use_array_ftor>(&aftor);
+    omt->delete_all_marked();
+    omt->verify_marks_consistent();
+    omt->iterate_over_marked<void, die>(nullptr);
+    inserter ins(omt);
+    aftor.iterate(ins);
+    omt->verify_marks_consistent();
+}
+
+// Deleter thread body.  extrav points at the array of reader_extra contexts.
+// Runs 20 rounds; each round sleeps, takes the write lock, runs
+// stress_deleter and releases the lock.  After the last round it clears
+// shared.running and returns nullptr.
+static void *stress_delete_worker(void *extrav) {
+    reader_extra *CAST_FROM_VOIDP(readers, extrav);
+    stress_shared &shared = *readers[0].shared;
+    toku_mutex_t &mutex = shared.mutex;
+    const int num_marker_threads = shared.num_marker_threads;
+    const double repetitions = 20;
+    int round = 0;
+    while (round < repetitions) {
+        ++round;
+        // sleep for up to 0.15s: short in early rounds, longer in later ones
+        const int sleep_for = 1000 * 100 * (1.5 * round / repetitions);
+        usleep(sleep_for);
+
+        toku_mutex_lock(&mutex);
+        rwlock_write_lock(&shared.lock, &mutex);
+        toku_mutex_unlock(&mutex);
+
+        stress_deleter(readers, num_marker_threads, shared.omt);
+
+        toku_mutex_lock(&mutex);
+        rwlock_write_unlock(&shared.lock);
+        toku_mutex_unlock(&mutex);
+    }
+    toku_sync_bool_compare_and_swap(&shared.running, true, false);
+    return nullptr;
+}
+
+// Multithreaded stress test: five marker threads mark random ranges of a
+// shared omt of nelts elements while one deleter thread periodically
+// verifies, deletes and re-inserts the marked elements.  Thread creation and
+// joining are asserted to succeed, so a failed create can never be followed
+// by a join on an uninitialized thread handle.
+static void stress_test(int nelts) {
+    stress_omt omt;
+    omt.create();
+    for (int i = 0; i < nelts; ++i) {
+        omt.insert_at(fudge(i), i);
+    }
+
+    const int num_marker_threads = 5;
+    struct stress_shared extra;
+    ZERO_STRUCT(extra);
+    extra.omt = &omt;
+    toku_mutex_init(toku_uninstrumented, &extra.mutex, nullptr);
+    rwlock_init(toku_uninstrumented, &extra.lock);
+    extra.running = true;
+    extra.num_marker_threads = num_marker_threads;
+
+    struct reader_extra readers[num_marker_threads];
+    ZERO_ARRAY(readers);
+
+    int r;
+    srandom(time(NULL));
+    toku_pthread_t marker_threads[num_marker_threads];
+    for (int i = 0; i < num_marker_threads; ++i) {
+        struct reader_extra &reader = readers[i];
+        reader.tid = i;
+        reader.shared = &extra;
+
+        // Both generators start from the same seed so that the deleter can
+        // replay, with rand_write, the ranges the marker drew from rand_read.
+        int seed = random();
+        r = myinitstate_r(seed, reader.buf_read, sizeof(reader.buf_read), &reader.rand_read);
+        invariant_zero(r);
+        r = myinitstate_r(seed, reader.buf_write, sizeof(reader.buf_write), &reader.rand_write);
+        invariant_zero(r);
+
+        r = toku_pthread_create(toku_uninstrumented,
+                                &marker_threads[i],
+                                nullptr,
+                                stress_mark_worker,
+                                &reader);
+        invariant_zero(r);
+    }
+
+    // The deleter receives the whole readers array and stops the markers
+    // (by clearing extra.running) once it has finished.
+    toku_pthread_t deleter_thread;
+    r = toku_pthread_create(toku_uninstrumented,
+                            &deleter_thread,
+                            nullptr,
+                            stress_delete_worker,
+                            &readers[0]);
+    invariant_zero(r);
+    r = toku_pthread_join(deleter_thread, NULL);
+    invariant_zero(r);
+
+    for (int i = 0; i < num_marker_threads; ++i) {
+        r = toku_pthread_join(marker_threads[i], NULL);
+        invariant_zero(r);
+    }
+
+    rwlock_destroy(&extra.lock);
+    toku_mutex_destroy(&extra.mutex);
+
+    omt.destroy();
+}
+
+} // end namespace test
+
+} // end namespace toku
+
+// Test entry point: runs the single-threaded test on omts of 10, 20, 40, 80
+// and 9000 elements, then the multithreaded stress test on 100000 elements.
+int test_main(int argc, const char *argv[]) {
+    default_parse_args(argc, argv);
+
+    // every size must be a multiple of 10, as run_test requires
+    static const int sizes[] = { 10, 20, 40, 80, 9000 };
+    const int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
+    for (int k = 0; k < num_sizes; ++k) {
+        toku::test::run_test(sizes[k]);
+    }
+
+    toku::test::stress_test(1000 * 100);
+
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/memarena-test.cc b/storage/tokudb/PerconaFT/util/tests/memarena-test.cc
new file mode 100644
index 00000000..94838506
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/memarena-test.cc
@@ -0,0 +1,184 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <string.h>
+
+#include "portability/toku_assert.h"
+
+#include "util/memarena.h"
+
+// Unit tests for memarena.  The tests read memarena's _current_chunk member
+// directly, so they rely on this class having access to it.
+class memarena_unit_test {
+private:
+    // Byte value written to (and expected back from) freshly allocated memory.
+    static const int magic = 37;
+
+    // Size in bytes of each allocation made while populating an arena.
+    static const size_t piece_size = 64;
+
+    // Calls fn(buf, used) once for every chunk the arena's iterator yields.
+    template <typename F>
+    void iterate_chunks(memarena *ma, F &fn) {
+        for (memarena::chunk_iterator it(ma); it.more(); it.next()) {
+            size_t used = 0;
+            const void *buf = it.current(&used);
+            fn(buf, used);
+        }
+    }
+
+    // Fills buf[0..n) with magic and checks that every byte reads back as
+    // magic.  buf is not touched when n is zero, so it may be null then.
+    static void write_and_verify_magic(void *buf, size_t n) {
+        if (n > 0) {
+            memset(buf, magic, n);
+        }
+        const char *c = reinterpret_cast<const char *>(buf);
+        for (size_t i = 0; i < n; i++) {
+            invariant(c[i] == magic);
+        }
+    }
+
+    // Stores into each byte of buf[0..n) the low byte of that byte's own
+    // address; test_iterate_fn later checks exactly this pattern.
+    static void stamp_with_addresses(void *buf, size_t n) {
+        char *c = reinterpret_cast<char *>(buf);
+        for (size_t i = 0; i < n; i++) {
+            c[i] = (char) ((intptr_t) &c[i]);
+        }
+    }
+
+    // Allocates `size` bytes from the arena, rounded up to a whole number of
+    // piece_size-byte pieces, and stamps every piece with the address pattern.
+    static void populate(memarena *ma, size_t size) {
+        // One allocation per full piece.  (The loop used to advance by 64
+        // while being bounded by size / 64, which allocated only a small
+        // fraction of the requested amount.)
+        const size_t full_pieces = size / piece_size;
+        for (size_t k = 0; k < full_pieces; k++) {
+            stamp_with_addresses(ma->malloc_from_arena(piece_size), piece_size);
+        }
+        // A trailing partial piece is still allocated at full piece size.
+        if (size % piece_size != 0) {
+            stamp_with_addresses(ma->malloc_from_arena(piece_size), piece_size);
+        }
+    }
+
+    // create(size) must yield an empty current chunk of exactly `size` bytes
+    // (with a null buffer when size is zero) whose memory is usable.
+    void test_create(size_t size) {
+        memarena ma;
+        ma.create(size);
+        invariant(ma._current_chunk.size == size);
+        invariant(ma._current_chunk.used == 0);
+        if (size == 0) {
+            invariant_null(ma._current_chunk.buf);
+        } else {
+            invariant_notnull(ma._current_chunk.buf);
+        }
+
+        // make sure memory was allocated ok by
+        // writing to buf and reading it back
+        write_and_verify_magic(ma._current_chunk.buf, size);
+        ma.destroy();
+    }
+
+    // malloc_from_arena(size) on a 14-byte arena must return non-null memory
+    // that can be written and read back for all `size` bytes.
+    void test_malloc(size_t size) {
+        memarena ma;
+        ma.create(14);
+        void *v = ma.malloc_from_arena(size);
+        invariant_notnull(v);
+
+        // make sure memory was allocated ok by writing to the returned
+        // pointer (not to whatever the current chunk happens to be) and
+        // reading it back
+        write_and_verify_magic(v, size);
+        ma.destroy();
+    }
+
+    // Chunk visitor: every used byte must hold the low byte of its address.
+    static void test_iterate_fn(const void *buf, size_t used) {
+        const char *c = reinterpret_cast<const char *>(buf);
+        for (size_t i = 0; i < used; i++) {
+            invariant(c[i] == (char) ((intptr_t) &c[i]));
+        }
+    }
+
+    // Populates an arena and checks the pattern through the chunk iterator.
+    void test_iterate(size_t size) {
+        memarena ma;
+        ma.create(14);
+        populate(&ma, size);
+
+        iterate_chunks(&ma, test_iterate_fn);
+        ma.destroy();
+    }
+
+    // Populates an arena, moves its memory into a second arena and checks
+    // the pattern through the second arena's chunk iterator.
+    void test_move_memory(size_t size) {
+        memarena ma;
+        ma.create(14);
+        populate(&ma, size);
+
+        memarena ma2;
+        ma.move_memory(&ma2);
+        iterate_chunks(&ma2, test_iterate_fn);
+
+        ma.destroy();
+        ma2.destroy();
+    }
+
+public:
+    // Runs every test case over a range of sizes.
+    void test() {
+        test_create(0);
+        test_create(64);
+        test_create(128 * 1024 * 1024);
+        test_malloc(0);
+        test_malloc(63);
+        test_malloc(64);
+        test_malloc(64 * 1024 * 1024);
+        test_malloc((64 * 1024 * 1024) + 1);
+        test_iterate(0);
+        test_iterate(63);
+        test_iterate(128 * 1024);
+        test_iterate(64 * 1024 * 1024);
+        test_iterate((64 * 1024 * 1024) + 1);
+        test_move_memory(0);
+        test_move_memory(1);
+        test_move_memory(63);
+        test_move_memory(65);
+        test_move_memory(65 * 1024 * 1024);
+        test_move_memory(101 * 1024 * 1024);
+    }
+};
+
+// Entry point: runs the whole memarena unit test suite once.
+int main(void) {
+    memarena_unit_test suite;
+    suite.test();
+
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/minicron-change-period-data-race.cc b/storage/tokudb/PerconaFT/util/tests/minicron-change-period-data-race.cc
new file mode 100644
index 00000000..952cbf57
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/minicron-change-period-data-race.cc
@@ -0,0 +1,66 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2018, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2018, Percona and/or its affiliates. All rights reserved."
+
+#include <toku_portability.h>
+#include "test.h"
+#include "util/minicron.h"
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+
+// The thread sanitizer detected a data race in the minicron in a test unrelated to the minicron.
+// This test reproduces the data race in a much smaller test which merely runs minicron tasks
+// while changing the minicron period in an unrelated thread.
+
+// Minicron callback that does no work and always reports success (0).
+static int do_nothing(void *UU(unused)) {
+    const int success = 0;
+    return success;
+}
+
+// Sets up a minicron with period argument 1 and a no-op callback, calls
+// toku_minicron_change_period 1000 times from this thread, then shuts the
+// minicron down.  Both setup and shutdown must return 0.
+int test_main (int argc, const char *argv[]) {
+    default_parse_args(argc, argv);
+
+    minicron cron = {};
+    int rc = toku_minicron_setup(&cron, 1, do_nothing, nullptr);
+    assert(rc == 0);
+
+    int remaining = 1000;
+    while (remaining-- > 0) {
+        toku_minicron_change_period(&cron, 1);
+    }
+
+    rc = toku_minicron_shutdown(&cron);
+    assert(rc == 0);
+
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/minicron-test.cc b/storage/tokudb/PerconaFT/util/tests/minicron-test.cc
new file mode 100644
index 00000000..026ab744
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/minicron-test.cc
@@ -0,0 +1,221 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <toku_portability.h>
+#include "test.h"
+#include "util/minicron.h"
+#include <unistd.h>
+
+#include <string.h>
+#include <stdlib.h>
+
+// Returns a - b in seconds, as a double.
+static double
+tdiff (struct timeval *a, struct timeval *b) {
+    const double whole_seconds = a->tv_sec - b->tv_sec;
+    const double micros        = a->tv_usec - b->tv_usec;
+    return whole_seconds + micros * 1e-6;
+}
+
+// Reference point for elapsed(); test_main stores the current time here
+// before it starts any test thread.
+struct timeval starttime;
+
+// Returns the number of seconds between starttime and now.
+static double elapsed (void) {
+    struct timeval current;
+    gettimeofday(&current, 0);
+    const double seconds = tdiff(&current, &starttime);
+    return seconds;
+}
+
+static int // minicron callback for tests in which the callback must never be invoked
+#ifndef GCOV
+__attribute__((__noreturn__))
+#endif
+never_run (void *a) {
+    assert(a==0);
+    assert(0); // unconditional failure: being called at all is the error
+#if defined(GCOV)
+    return 0; // only compiled for GCOV builds, where the noreturn attribute is omitted
+#endif
+}
+
+// Starts a minicron with period 0 (so the callback should never run),
+// leaves it alone for one second, and shuts it down.  Returns v unchanged.
+static void*
+test1 (void* v)
+{
+    struct minicron cron;
+    memset(&cron, 0, sizeof(struct minicron));
+
+    int rc = toku_minicron_setup(&cron, 0, never_run, 0);
+    assert(rc==0);
+
+    sleep(1);
+
+    rc = toku_minicron_shutdown(&cron);
+    assert(rc==0);
+    return v;
+}
+
+// Starts a minicron with period argument 10000 and shuts it down after
+// 2 seconds; the callback should not have run by then.  Returns v unchanged.
+static void*
+test2 (void* v)
+{
+    struct minicron cron;
+    memset(&cron, 0, sizeof(struct minicron));
+
+    int rc = toku_minicron_setup(&cron, 10000, never_run, 0);
+    assert(rc==0);
+
+    sleep(2);
+
+    rc = toku_minicron_shutdown(&cron);
+    assert(rc==0);
+    return v;
+}
+
+struct tenx { // state shared between test3 and its minicron callback run_5x
+    struct timeval tv; // time recorded by test3 just before the minicron is set up
+    int counter;       // number of times run_5x has completed
+};
+
+// Minicron callback used by test3.  v points at a struct tenx.  Fails the
+// test if it is invoked no later than (counter + 0.5) seconds after tx->tv,
+// otherwise bumps the counter and returns 0.
+static int
+run_5x (void *v) {
+    struct tenx *CAST_FROM_VOIDP(state, v);
+
+    struct timeval current;
+    gettimeofday(&current, 0);
+    const double since_start = tdiff(&current, &state->tv);
+    if (verbose) printf("T=%f tx->counter=%d\n", since_start, state->counter);
+
+    // Only a premature invocation is treated as an error.  A late one can
+    // always happen and there is no good way to test for it.
+    const bool on_time_or_late = since_start > 0.5 + state->counter;
+    if (!on_time_or_late) {
+      printf("T=%f tx->counter=%d\n", since_start, state->counter);
+      assert(0);
+    }
+
+    state->counter++;
+    return 0;
+}
+
+// Runs run_5x from a minicron with period argument 1000 for 5 seconds and
+// expects it to have run 4 or 5 times.  Returns v unchanged.
+static void*
+test3 (void* v)
+{
+    struct tenx state;
+    gettimeofday(&state.tv, 0);
+    state.counter = 0;
+
+    struct minicron cron;
+    memset(&cron, 0, sizeof(struct minicron));
+
+    int rc = toku_minicron_setup(&cron, 1000, run_5x, &state);
+    assert(rc==0);
+
+    sleep(5);
+
+    rc = toku_minicron_shutdown(&cron);
+    assert(rc==0);
+
+    // after 5 seconds it could have run 4 or 5 times.
+    assert(state.counter>=4 && state.counter<=5);
+    return v;
+}
+
+// Minicron callback that takes 3 seconds: increments the int that v points
+// at, sleeps for 3 seconds, and returns 0.  Logs start/end times if verbose.
+static int
+run_3sec (void *v) {
+    if (verbose) {
+        printf("start3sec at %.6f\n", elapsed());
+    }
+
+    int *CAST_FROM_VOIDP(invocations, v);
+    ++*invocations;
+    sleep(3);
+
+    if (verbose) {
+        printf("end3sec at %.6f\n", elapsed());
+    }
+    return 0;
+}
+
+// A slow callback must not be run too often: with period argument 2000 and
+// a callback that takes 3 seconds, exactly 3 runs are expected to have
+// started after 10 seconds.  Returns v unchanged.
+static void*
+test4 (void *v) {
+    int invocations = 0;
+
+    struct minicron cron;
+    memset(&cron, 0, sizeof(struct minicron));
+
+    int rc = toku_minicron_setup(&cron, 2000, run_3sec, &invocations);
+    assert(rc==0);
+
+    sleep(10);
+
+    rc = toku_minicron_shutdown(&cron);
+    assert(rc==0);
+    assert(invocations==3);
+    return v;
+}
+
+// Same expectation as test4, but the minicron is set up with period
+// argument 10000 and immediately changed to 2000 before waiting 10 seconds.
+// Returns v unchanged.
+static void*
+test5 (void *v) {
+    int invocations = 0;
+
+    struct minicron cron;
+    memset(&cron, 0, sizeof(struct minicron));
+
+    int rc = toku_minicron_setup(&cron, 10000, run_3sec, &invocations);
+    assert(rc==0);
+    toku_minicron_change_period(&cron, 2000);
+
+    sleep(10);
+
+    rc = toku_minicron_shutdown(&cron);
+    assert(rc==0);
+    assert(invocations==3);
+    return v;
+}
+
+// Sets up a minicron with period argument 5000, changes the period to 0
+// right away, and waits 7 seconds: the never_run callback must not fire.
+// Returns v unchanged.
+static void*
+test6 (void *v) {
+    struct minicron cron;
+    memset(&cron, 0, sizeof(struct minicron));
+
+    int rc = toku_minicron_setup(&cron, 5000, never_run, 0);
+    assert(rc==0);
+    toku_minicron_change_period(&cron, 0);
+
+    sleep(7);
+
+    rc = toku_minicron_shutdown(&cron);
+    assert(rc==0);
+    return v;
+}
+
+// The callback should run once per period even when each run is long:
+// with period argument 5000 and a 3-second callback, exactly 3 runs are
+// expected after 17 seconds.  Returns v unchanged.
+static void*
+test7 (void *v) {
+    struct minicron cron;
+    memset(&cron, 0, sizeof(struct minicron));
+
+    int invocations = 0;
+    int rc = toku_minicron_setup(&cron, 5000, run_3sec, &invocations);
+    assert(rc==0);
+
+    sleep(17);
+
+    rc = toku_minicron_shutdown(&cron);
+    assert(rc==0);
+    assert(invocations==3);
+    return v;
+}
+
+// Signature shared by all test thread bodies.
+typedef void*(*ptf)(void*);
+
+// Records the start time, launches every test in its own thread, then joins
+// them all.  Each test is started with a null argument and must return null.
+int
+test_main (int argc, const char *argv[]) {
+    default_parse_args(argc,argv);
+    gettimeofday(&starttime, 0);
+
+    ptf testfuns[] = {
+        test1,
+        test2,
+        test3,
+        test4,
+        test5,
+        test6,
+        test7
+    };
+    const unsigned int num_tests = sizeof(testfuns)/sizeof(testfuns[0]);
+    toku_pthread_t tests[num_tests];
+
+    for (unsigned int t = 0; t < num_tests; t++) {
+        int r = toku_pthread_create(
+            toku_uninstrumented, &tests[t], nullptr, testfuns[t], nullptr);
+        assert(r == 0);
+    }
+    for (unsigned int t = 0; t < num_tests; t++) {
+        void *result;
+        int r = toku_pthread_join(tests[t], &result);
+        assert(r==0);
+        assert(result==0);
+    }
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/omt-test.cc b/storage/tokudb/PerconaFT/util/tests/omt-test.cc
new file mode 100644
index 00000000..0d2c08f5
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/omt-test.cc
@@ -0,0 +1,898 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "test.h"
+
+#include <util/omt.h>
+
+// Command-line handling for this test:
+//   -v  increases verbosity, -q resets it to 0;
+//   -h  prints the usage text to stderr and exits with status 0;
+//   any other argument prints the usage text and exits with status 1.
+static void
+parse_args (int argc, const char *argv[]) {
+    const char *program = argv[0];
+    for (int i = 1; i < argc; i++) {
+        const char *arg = argv[i];
+        if (strcmp(arg, "-v")==0) {
+            verbose++;
+            continue;
+        }
+        if (strcmp(arg, "-q")==0) {
+            verbose = 0;
+            continue;
+        }
+        // Everything else ends the program; only -h counts as success.
+        const int resultcode = (strcmp(arg, "-h")==0) ? 0 : 1;
+        fprintf(stderr, "Usage:\n%s [-v|-h]\n", program);
+        exit(resultcode);
+    }
+}
+/* End ".h like" stuff. */
+
+struct value { // element type the tests store in the OMT (by pointer, as OMTVALUE)
+    uint32_t number; // the key the tests order and compare elements by
+};
+#define V(x) ((struct value *)(x)) // view an OMTVALUE (void *) as a struct value *
+
+enum rand_type { // NOTE(review): presumably selects which init_*_values helper fills the globals; usage is outside this view
+    TEST_RANDOM,
+    TEST_SORTED,
+    TEST_IDENTITY
+};
+enum close_when_done { // tells test_close whether to destroy and free global_omt
+    CLOSE_WHEN_DONE, // destroy global_omt and free it
+    KEEP_WHEN_DONE   // leave global_omt alive for the caller
+};
+enum create_type { // how test_create_from_sorted_array builds global_omt
+    STEAL_ARRAY,             // create_steal_sorted_array on a copy of global_values
+    BATCH_INSERT,            // create_from_sorted_array on global_values
+    INSERT_AT,               // test_create_insert_at_sequential
+    INSERT_AT_ALMOST_RANDOM, // test_create_insert_at_almost_random
+};
+
+/* Globals */
+typedef void *OMTVALUE;                     // opaque element type stored in the OMT under test
+toku::omt<OMTVALUE> *global_omt;            // the OMT under test; allocated by test_create*, released by test_close
+OMTVALUE*       global_values = NULL;       // expected OMT contents, in index order
+struct value*   global_nums   = NULL;       // backing storage that global_values entries point into
+uint32_t       global_length;               // number of valid entries in global_values / global_nums
+
+// Frees global_values and global_nums and resets both to NULL.  Both must
+// be non-NULL on entry.
+static void
+cleanup_globals (void) {
+    assert(global_values);
+    toku_free(global_values);
+    global_values = NULL;
+
+    assert(global_nums);
+    toku_free(global_nums);
+    global_nums = NULL;
+}
+
+/* Some test wrappers */
+// Bundles a plain function-pointer iteration callback with its extra
+// argument so both can travel through toku::omt's templated iterate().
+struct functor {
+    int (*f)(OMTVALUE, uint32_t, void *);
+    void *v;
+};
+
+// Adapter used as the template argument of iterate(): forwards the value
+// and index to ftor->f together with ftor->v.
+int call_functor(const OMTVALUE &v, uint32_t idx, functor *const ftor);
+int call_functor(const OMTVALUE &v, uint32_t idx, functor *const ftor) {
+    OMTVALUE value = const_cast<OMTVALUE>(v);
+    return ftor->f(value, idx, ftor->v);
+}
+
+// Iterates over omt calling f(value, index, v) for each element; returns
+// whatever omt->iterate returns.
+static int omt_iterate(toku::omt<void *> *omt, int (*f)(OMTVALUE, uint32_t, void*), void*v) {
+    struct functor ftor;
+    ftor.f = f;
+    ftor.v = v;
+    return omt->iterate<functor, call_functor>(&ftor);
+}
+
+// Bundles a plain function-pointer comparison ("heaviside") callback with
+// its extra argument for use with toku::omt's templated search functions.
+struct heftor {
+    int (*h)(OMTVALUE, void *v);
+    void *v;
+};
+
+// Adapter used as the template argument of insert/find/find_zero: forwards
+// the value to htor.h together with htor.v.
+int call_heftor(const OMTVALUE &v, const heftor &htor);
+int call_heftor(const OMTVALUE &v, const heftor &htor) {
+    OMTVALUE value = const_cast<OMTVALUE>(v);
+    return htor.h(value, htor.v);
+}
+// Inserts value into omt at the position determined by comparison callback
+// h (called with extra argument v); returns omt->insert's result and lets
+// it fill in *index.
+static int omt_insert(toku::omt<void *> *omt, OMTVALUE value, int(*h)(OMTVALUE, void*v), void *v, uint32_t *index) {
+    struct heftor htor;
+    htor.h = h;
+    htor.v = v;
+    return omt->insert<heftor, call_heftor>(value, htor, index);
+}
+// Function-pointer front end for omt::find_zero: searches V using
+// comparison callback h (called with extra) and returns find_zero's result.
+static int omt_find_zero(toku::omt<void *> *V, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, uint32_t *index) {
+    struct heftor htor;
+    htor.h = h;
+    htor.v = extra;
+    return V->find_zero<heftor, call_heftor>(htor, value, index);
+}
+// Function-pointer front end for omt::find: searches V in the given
+// direction using comparison callback h (called with extra) and returns
+// find's result.
+static int omt_find(toku::omt<void *> *V, int (*h)(OMTVALUE, void*extra), void*extra, int direction, OMTVALUE *value, uint32_t *index) {
+    struct heftor htor;
+    htor.h = h;
+    htor.v = extra;
+    return V->find<heftor, call_heftor>(htor, direction, value, index);
+}
+// Allocates a new omt object and asks omt->split_at to split into it at
+// index.  On success stores the new object in *newomtp and returns 0; on
+// failure frees the new object, leaves *newomtp alone and returns the error.
+static int omt_split_at(toku::omt<void *> *omt, toku::omt<void *> **newomtp, uint32_t index) {
+    toku::omt<void *> *XMALLOC(fresh);
+    const int r = omt->split_at(fresh, index);
+    if (r == 0) {
+        *newomtp = fresh;
+        return r;
+    }
+    toku_free(fresh);
+    return r;
+}
+// Allocates a new omt object, merges leftomt and rightomt into it, frees
+// the two input objects, and stores the result in *newomtp.  Always
+// returns 0.
+static int omt_merge(toku::omt<void *> *leftomt, toku::omt<void *> *rightomt, toku::omt<void *> **newomtp) {
+    toku::omt<void *> *XMALLOC(merged);
+    merged->merge(leftomt, rightomt);
+
+    // The input wrapper objects are no longer needed after the merge.
+    toku_free(leftomt);
+    toku_free(rightomt);
+
+    *newomtp = merged;
+    return 0;
+}
+
+const unsigned int random_seed = 0xFEADACBA; // fixed seed; NOTE(review): presumably passed to the init_*_values helpers, usage is outside this view
+
+// Common setup for the init_*_values helpers: seeds random() with seed,
+// frees the previous global arrays, and allocates uninitialised
+// global_values / global_nums arrays of num_elements entries each.
+static void
+init_init_values (unsigned int seed, uint32_t num_elements) {
+    srandom(seed);
+
+    // Release the previous arrays (they must exist) before replacing them.
+    cleanup_globals();
+
+    global_length = num_elements;
+    XMALLOC_N(num_elements, global_values);
+    XMALLOC_N(num_elements, global_nums);
+}
+
+// Rebuilds the globals with num_elements entries where entry k has number
+// k and global_values[k] points at global_nums[k].
+static void
+init_identity_values (unsigned int seed, uint32_t num_elements) {
+    init_init_values(seed, num_elements);
+
+    for (uint32_t k = 0; k < global_length; k++) {
+        struct value *slot = &global_nums[k];
+        slot->number     = k;
+        global_values[k] = (OMTVALUE)slot;
+    }
+}
+
+// Rebuilds the globals with num_elements strictly increasing numbers: each
+// number is the previous one plus a random step in [1, 32].
+// global_values[k] points at global_nums[k].
+static void
+init_distinct_sorted_values (unsigned int seed, uint32_t num_elements) {
+    init_init_values(seed, num_elements);
+
+    uint32_t running = 0;
+    for (uint32_t k = 0; k < global_length; k++) {
+        const uint32_t step = (uint32_t)(random() % 32) + 1;
+        running += step;
+        global_nums[k].number = running;
+        global_values[k]      = (OMTVALUE)&global_nums[k];
+    }
+}
+
+// Rebuilds the globals with num_elements distinct numbers in random order:
+// generates distinct sorted numbers, then shuffles the contents of
+// global_nums in place.  global_values[k] keeps pointing at global_nums[k].
+static void
+init_distinct_random_values (unsigned int seed, uint32_t num_elements) {
+    init_distinct_sorted_values(seed, num_elements);
+
+    // Fisher-Yates shuffle: slot i is swapped with a slot chosen uniformly
+    // from [i, global_length).  The "i + 1 <" bound keeps the loop empty for
+    // lengths 0 and 1 without unsigned wrap-around.
+    for (uint32_t i = 0; i + 1 < global_length; i++) {
+        const uint32_t choices = global_length - i;
+        const uint32_t choice  = i + (uint32_t)(random() % choices);
+        if (choice != i) {
+            const struct value temp = global_nums[i];
+            global_nums[i]      = global_nums[choice];
+            global_nums[choice] = temp;
+        }
+    }
+}
+
+// Gives the globals an initial one-element allocation.  cleanup_globals
+// asserts that both arrays are non-NULL, so this presumably has to run
+// before the first init_*_values call.
+static void
+init_globals (void) {
+    global_length = 1;
+    XMALLOC_N(1, global_values);
+    XMALLOC_N(1, global_nums);
+}
+
+// Destroys and frees global_omt when do_close is CLOSE_WHEN_DONE; does
+// nothing when it is KEEP_WHEN_DONE.  Any other value fails an assertion.
+static void
+test_close (enum close_when_done do_close) {
+    if (do_close != KEEP_WHEN_DONE) {
+        assert(do_close == CLOSE_WHEN_DONE);
+        global_omt->destroy();
+        toku_free(global_omt);
+    }
+}
+
+// Allocates global_omt, creates it empty, then closes it according to
+// do_close.
+static void
+test_create (enum close_when_done do_close) {
+    XMALLOC(global_omt);
+    global_omt->create();
+
+    test_close(do_close);
+}
+
+// A freshly created OMT must report size 0.
+static void
+test_create_size (enum close_when_done do_close) {
+    test_create(KEEP_WHEN_DONE);
+
+    const uint32_t fresh_size = global_omt->size();
+    assert(fresh_size == 0);
+
+    test_close(do_close);
+}
+
+// Builds global_omt with insert_at, alternating between the front half and
+// the back half of global_values: round i inserts global_values[i] at index
+// i and then global_values[global_length-1-i] at index i+1.  Before and
+// after the loop, inserting one or two positions past the end is expected
+// to fail with EINVAL.  The OMT size is checked after every insertion.
+static void
+test_create_insert_at_almost_random (enum close_when_done do_close) {
+    uint32_t i;
+    int r;
+    uint32_t size = 0;
+
+    test_create(KEEP_WHEN_DONE);
+    r = global_omt->insert_at(global_values[0], global_omt->size()+1);
+    CKERR2(r, EINVAL);
+    r = global_omt->insert_at(global_values[0], global_omt->size()+2);
+    CKERR2(r, EINVAL);
+    for (i = 0; i < global_length/2; i++) {
+        assert(size==global_omt->size());
+        r = global_omt->insert_at(global_values[i], i);
+        CKERR(r);
+        // The expected size is updated outside of assert() so the
+        // bookkeeping does not depend on assertions being compiled in.
+        size++;
+        assert(size==global_omt->size());
+        r = global_omt->insert_at(global_values[global_length-1-i], i+1);
+        CKERR(r);
+        size++;
+        assert(size==global_omt->size());
+    }
+    r = global_omt->insert_at(global_values[0], global_omt->size()+1);
+    CKERR2(r, EINVAL);
+    r = global_omt->insert_at(global_values[0], global_omt->size()+2);
+    CKERR2(r, EINVAL);
+    assert(size==global_omt->size());
+    test_close(do_close);
+}
+
+// Builds global_omt by inserting global_values[i] at index i for every i in
+// order.  Before and after the loop, inserting one or two positions past
+// the end is expected to fail with EINVAL.  The OMT size is checked after
+// every insertion.
+static void
+test_create_insert_at_sequential (enum close_when_done do_close) {
+    uint32_t i;
+    int r;
+    uint32_t size = 0;
+
+    test_create(KEEP_WHEN_DONE);
+    r = global_omt->insert_at(global_values[0], global_omt->size()+1);
+    CKERR2(r, EINVAL);
+    r = global_omt->insert_at(global_values[0], global_omt->size()+2);
+    CKERR2(r, EINVAL);
+    for (i = 0; i < global_length; i++) {
+        assert(size==global_omt->size());
+        r = global_omt->insert_at(global_values[i], i);
+        CKERR(r);
+        // The expected size is updated outside of assert() so the
+        // bookkeeping does not depend on assertions being compiled in.
+        size++;
+        assert(size==global_omt->size());
+    }
+    r = global_omt->insert_at(global_values[0], global_omt->size()+1);
+    CKERR2(r, EINVAL);
+    r = global_omt->insert_at(global_values[0], global_omt->size()+2);
+    CKERR2(r, EINVAL);
+    assert(size==global_omt->size());
+    test_close(do_close);
+}
+
+// Builds global_omt from global_values using the strategy named by
+// create_choice, then closes it according to do_close.  An unknown
+// create_choice fails an assertion.
+static void
+test_create_from_sorted_array (enum create_type create_choice, enum close_when_done do_close) {
+    global_omt = NULL;
+
+    switch (create_choice) {
+    case BATCH_INSERT:
+        XMALLOC(global_omt);
+        global_omt->create_from_sorted_array(global_values, global_length);
+        break;
+    case STEAL_ARRAY: {
+        XMALLOC(global_omt);
+        // The OMT takes ownership of the array handed to it, so give it a
+        // copy; it is expected to null out our pointer.
+        OMTVALUE* XMALLOC_N(global_length, values_copy);
+        memcpy(values_copy, global_values, global_length*sizeof(*global_values));
+        global_omt->create_steal_sorted_array(&values_copy, global_length, global_length);
+        assert(values_copy==NULL);
+        break;
+    }
+    case INSERT_AT:
+        test_create_insert_at_sequential(KEEP_WHEN_DONE);
+        break;
+    case INSERT_AT_ALMOST_RANDOM:
+        test_create_insert_at_almost_random(KEEP_WHEN_DONE);
+        break;
+    default:
+        assert(false);
+    }
+
+    assert(global_omt!=NULL);
+    test_close(do_close);
+}
+
+// After building the OMT with create_choice its size must equal
+// global_length.
+static void
+test_create_from_sorted_array_size (enum create_type create_choice, enum close_when_done do_close) {
+    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
+
+    const uint32_t built_size = global_omt->size();
+    assert(built_size==global_length);
+
+    test_close(do_close);
+}
+
+// Checks omtree against the expected contents val[0..len):
+//  - the size must be len;
+//  - fetch(i) for i < len must succeed and return exactly val[i];
+//  - fetch(i) for len <= i < 2*len must fail with EINVAL and leave the
+//    output value untouched.
+static void
+test_fetch_verify (toku::omt<void *> *omtree, OMTVALUE* val, uint32_t len ) {
+    uint32_t i;
+    // Sentinel used to detect whether fetch wrote its output; the loop
+    // asserts that it differs from every expected value.
+    OMTVALUE const sentinel = (OMTVALUE)&i;
+
+    assert(len == omtree->size());
+    for (i = 0; i < len; i++) {
+        assert(sentinel!=val[i]);
+        OMTVALUE fetched = NULL;
+        int r = omtree->fetch(i, &fetched);
+        CKERR(r);
+        assert(fetched != NULL);
+        assert(fetched != sentinel);
+        assert(fetched == val[i]);
+        assert(V(fetched)->number == V(val[i])->number);
+    }
+
+    for (i = len; i < len*2; i++) {
+        OMTVALUE untouched = sentinel;
+        int r = omtree->fetch(i, &untouched);
+        CKERR2(r, EINVAL);
+        assert(untouched == sentinel);
+    }
+
+}
+
+// Builds the OMT with create_choice and verifies its contents via fetch.
+static void
+test_create_fetch_verify (enum create_type create_choice, enum close_when_done do_close) {
+    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
+
+    test_fetch_verify(global_omt, global_values, global_length);
+
+    test_close(do_close);
+}
+
+// Value iterate_helper returns when it is called with a NULL extra.
+static int iterate_helper_error_return = 1;
+
+// Iteration callback.  With extra == NULL it returns
+// iterate_helper_error_return; otherwise extra is the array of expected
+// values, v must equal the entry at idx, and 0 is returned.
+static int
+iterate_helper (OMTVALUE v, uint32_t idx, void* extra) {
+    if (extra == NULL) {
+        return iterate_helper_error_return;
+    }
+
+    OMTVALUE* expected = (OMTVALUE *)extra;
+    assert(v != NULL);
+    assert(v == expected[idx]);
+    assert(V(v)->number == V(expected[idx])->number);
+    return 0;
+}
+
+// Verifies iteration over omtree in two passes: first every element must
+// match vals and iteration must return 0; then, with a callback that
+// reports an error code, iteration must return that code (or 0 when len is
+// zero).
+static void
+test_iterate_verify (toku::omt<void *> *omtree, OMTVALUE* vals, uint32_t len) {
+    iterate_helper_error_return = 0;
+    int r = omt_iterate(omtree, iterate_helper, (void*)vals);
+    CKERR(r);
+
+    iterate_helper_error_return = 0xFEEDABBA;
+    r = omt_iterate(omtree, iterate_helper, NULL);
+    if (len) {
+        CKERR2(r, iterate_helper_error_return);
+    }
+    else {
+        CKERR2(r, 0);
+    }
+}
+
+// Builds the OMT with create_choice and verifies its contents via iterate.
+static void
+test_create_iterate_verify (enum create_type create_choice, enum close_when_done do_close) {
+    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
+
+    test_iterate_verify(global_omt, global_values, global_length);
+
+    test_close(do_close);
+}
+
+
+// Fills arr[0..len) with a random permutation of 0..len-1, using random()
+// as the source of randomness.  A len of 0 is allowed and touches nothing.
+static void
+permute_array (uint32_t* arr, uint32_t len) {
+    for (uint32_t i = 0; i < len; i++) {
+        arr[i] = i;
+    }
+    // Fisher-Yates shuffle: slot i is swapped with a slot chosen uniformly
+    // from [i, len).  The "i + 1 <" bound keeps the loop empty for len 0
+    // and 1 without unsigned wrap-around.
+    for (uint32_t i = 0; i + 1 < len; i++) {
+        const uint32_t choices = len - i;
+        const uint32_t choice  = i + (uint32_t)(random() % choices);
+        if (choice != i) {
+            const uint32_t temp = arr[i];
+            arr[i]      = arr[choice];
+            arr[choice] = temp;
+        }
+    }
+}
+
+// Exercises set_at.  The OMT is first built from pointers into a private
+// copy of global_nums; then, in random order, every slot is replaced with a
+// pointer into global_nums itself (holding a new random number), and the
+// OMT is verified via iterate and fetch after each replacement.  set_at at
+// index global_length and global_length+1 is expected to fail with EINVAL
+// both before and after.
+static void
+test_create_set_at (enum create_type create_choice, enum close_when_done do_close) {
+    struct value *saved_nums = NULL;
+    XMALLOC_N(global_length, saved_nums);
+
+    uint32_t *order = NULL;
+    XMALLOC_N(global_length, order);
+
+    OMTVALUE *saved_values = NULL;
+    XMALLOC_N(global_length, saved_values);
+
+    permute_array(order, global_length);
+
+    // Point global_values at the private copy so the OMT starts out
+    // holding pointers that differ from the replacements below.
+    for (uint32_t k = 0; k < global_length; k++) {
+        saved_nums[k]    = global_nums[k];
+        saved_values[k]  = &saved_nums[k];
+        global_values[k] = &saved_nums[k];
+    }
+    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
+
+    int r = global_omt->set_at(global_values[0], global_length);
+    CKERR2(r,EINVAL);
+    r = global_omt->set_at(global_values[0], global_length+1);
+    CKERR2(r,EINVAL);
+
+    for (uint32_t k = 0; k < global_length; k++) {
+        const uint32_t slot = order[k];
+        global_values[slot] = &global_nums[slot];
+        global_nums[slot].number = (uint32_t)random();
+        r = global_omt->set_at(global_values[slot], slot);
+        CKERR(r);
+        test_iterate_verify(global_omt, global_values, global_length);
+        test_fetch_verify(global_omt, global_values, global_length);
+    }
+
+    r = global_omt->set_at(global_values[0], global_length);
+    CKERR2(r,EINVAL);
+    r = global_omt->set_at(global_values[0], global_length+1);
+    CKERR2(r,EINVAL);
+
+    toku_free(order);
+    toku_free(saved_values);
+    toku_free(saved_nums);
+
+    test_close(do_close);
+}
+
+// Comparison callback for omt_insert: orders value against the element
+// passed as extra_insert by their number fields.  Returns -1, 0 or +1 when
+// value's number is less than, equal to or greater than the other's.
+// extra_insert must not be NULL.
+static int
+insert_helper (OMTVALUE value, void* extra_insert) {
+    OMTVALUE to_insert = (OMTVALUE)extra_insert;
+    assert(to_insert);
+
+    const uint32_t have = V(value)->number;
+    const uint32_t want = V(to_insert)->number;
+    if (have < want) {
+        return -1;
+    }
+    if (have > want) {
+        return +1;
+    }
+    return 0;
+}
+
+// Exercises insert.  Elements of global_nums are inserted into an empty
+// OMT in random order; global_values / global_length are rebuilt alongside
+// as the expected contents.  After each insertion the reported index is
+// checked against its neighbours, the OMT is verified, and a second insert
+// of the same element is expected to fail with DB_KEYEXIST and report the
+// index of the existing element.
+static void
+test_create_insert (enum close_when_done do_close) {
+    uint32_t *order = NULL;
+    XMALLOC_N(global_length, order);
+
+    permute_array(order, global_length);
+
+    test_create(KEEP_WHEN_DONE);
+
+    // global_length counts the elements inserted so far and grows back up
+    // to its original value.
+    const uint32_t target = global_length;
+    global_length = 0;
+    while (global_length < target) {
+        OMTVALUE to_insert = &global_nums[order[global_length]];
+        uint32_t idx = UINT32_MAX;
+
+        assert(global_length==global_omt->size());
+        int r = omt_insert(global_omt, to_insert, insert_helper, to_insert, &idx);
+        CKERR(r);
+        assert(idx <= global_length);
+        if (idx > 0) {
+            assert(V(to_insert)->number > V(global_values[idx-1])->number);
+        }
+        if (idx < global_length) {
+            assert(V(to_insert)->number < V(global_values[idx])->number);
+        }
+        global_length++;
+        assert(global_length==global_omt->size());
+
+        // Shift the tail of the expected array up by one to make room.
+        for (uint32_t k = global_length-1; k > idx; k--) {
+            global_values[k] = global_values[k-1];
+        }
+        global_values[idx] = to_insert;
+        test_fetch_verify(global_omt, global_values, global_length);
+        test_iterate_verify(global_omt, global_values, global_length);
+
+        // Inserting the same element again must be rejected.
+        idx = UINT32_MAX;
+        r = omt_insert(global_omt, to_insert, insert_helper, to_insert, &idx);
+        CKERR2(r, DB_KEYEXIST);
+        assert(idx < global_length);
+        assert(V(global_values[idx])->number == V(to_insert)->number);
+        assert(global_length==global_omt->size());
+
+        test_iterate_verify(global_omt, global_values, global_length);
+        test_fetch_verify(global_omt, global_values, global_length);
+    }
+
+    toku_free(order);
+
+    test_close(do_close);
+}
+
+// Creates an OMT from the sorted global values, then removes elements at
+// random indexes until it is empty, keeping global_values in step and
+// verifying the OMT after every removal.  Before and after the drain it
+// checks that delete_at() rejects indexes at or past the end with EINVAL.
+static void
+test_create_delete_at (enum create_type create_choice, enum close_when_done do_close) {
+    int r;
+    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
+
+    // Out-of-range deletes on the full OMT.
+    assert(global_length == global_omt->size());
+    r = global_omt->delete_at(global_length);
+    CKERR2(r,EINVAL);
+    assert(global_length == global_omt->size());
+    r = global_omt->delete_at(global_length+1);
+    CKERR2(r,EINVAL);
+
+    while (global_length > 0) {
+        assert(global_length == global_omt->size());
+        const uint32_t victim = random()%global_length;
+        r = global_omt->delete_at(victim);
+        CKERR(r);
+        // Close the gap left by the removed element in the shadow array.
+        for (uint32_t dst = victim; dst+1 < global_length; dst++) {
+            global_values[dst] = global_values[dst+1];
+        }
+        global_length--;
+        test_fetch_verify(global_omt, global_values, global_length);
+        test_iterate_verify(global_omt, global_values, global_length);
+    }
+
+    // Out-of-range deletes on the now-empty OMT.
+    assert(global_length == 0);
+    assert(global_length == global_omt->size());
+    r = global_omt->delete_at(global_length);
+    CKERR2(r, EINVAL);
+    assert(global_length == global_omt->size());
+    r = global_omt->delete_at(global_length+1);
+    CKERR2(r, EINVAL);
+    test_close(do_close);
+}
+
+// Splits the global OMT at every index 0..global_length, verifies both
+// halves against the matching slices of global_values, checks that splits
+// past the end of either half fail with EINVAL and leave both halves
+// untouched, and finally merges the halves back together and verifies the
+// result.
+static void
+test_split_merge (enum create_type create_choice, enum close_when_done do_close) {
+    int r = ENOSYS;
+    uint32_t i = 0;
+    toku::omt<void *> *left_split = NULL;
+    toku::omt<void *> *right_split = NULL;
+    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
+
+    for (i = 0; i <= global_length; i++) {
+        // Splitting past the end of the whole OMT must be rejected.
+        r = omt_split_at(global_omt, &right_split, global_length+1);
+        CKERR2(r,EINVAL);
+        r = omt_split_at(global_omt, &right_split, global_length+2);
+        CKERR2(r,EINVAL);
+
+        //
+        // test successful split
+        //
+        r = omt_split_at(global_omt, &right_split, i);
+        CKERR(r);
+        // After the split the original OMT holds the left part.
+        left_split = global_omt;
+        global_omt = NULL;
+        assert(left_split->size() == i);
+        assert(right_split->size() == global_length - i);
+        test_fetch_verify(left_split, global_values, i);
+        test_iterate_verify(left_split, global_values, i);
+        test_fetch_verify(right_split, &global_values[i], global_length - i);
+        test_iterate_verify(right_split, &global_values[i], global_length - i);
+        //
+        // verify that new global_omt's cannot do bad splits
+        //
+        r = omt_split_at(left_split, &global_omt, i+1);
+        CKERR2(r,EINVAL);
+        assert(left_split->size() == i);
+        assert(right_split->size() == global_length - i);
+        r = omt_split_at(left_split, &global_omt, i+2);
+        CKERR2(r,EINVAL);
+        assert(left_split->size() == i);
+        assert(right_split->size() == global_length - i);
+        r = omt_split_at(right_split, &global_omt, global_length - i + 1);
+        CKERR2(r,EINVAL);
+        assert(left_split->size() == i);
+        assert(right_split->size() == global_length - i);
+        // Probe two past the end as well, mirroring the i+1 / i+2 checks on
+        // the left half (this used to repeat the "+ 1" probe).
+        r = omt_split_at(right_split, &global_omt, global_length - i + 2);
+        CKERR2(r,EINVAL);
+        assert(left_split->size() == i);
+        assert(right_split->size() == global_length - i);
+
+        //
+        // test merge
+        //
+        r = omt_merge(left_split,right_split,&global_omt);
+        CKERR(r);
+        left_split = NULL;
+        right_split = NULL;
+        assert(global_omt->size() == global_length);
+        test_fetch_verify(global_omt, global_values, global_length);
+        test_iterate_verify(global_omt, global_values, global_length);
+    }
+    test_close(do_close);
+}
+
+
+// Calls the value generator selected by rand_choice with random_seed and a
+// fixed size of 100 elements.  An unrecognized rand_choice trips an assert.
+static void
+init_values (enum rand_type rand_choice) {
+    const uint32_t test_size = 100;
+
+    if (rand_choice == TEST_RANDOM) {
+        init_distinct_random_values(random_seed, test_size);
+        return;
+    }
+    if (rand_choice == TEST_SORTED) {
+        init_distinct_sorted_values(random_seed, test_size);
+        return;
+    }
+    if (rand_choice == TEST_IDENTITY) {
+        init_identity_values(random_seed, test_size);
+        return;
+    }
+    assert(false);
+}
+
+// Runs the array-backed OMT tests for one (creation strategy, value
+// generator) pair.  init_values() is called again before each group so that
+// no group depends on state left behind by the previous one.
+static void
+test_create_array (enum create_type create_choice, enum rand_type rand_choice) {
+    // Construction.
+    init_values(rand_choice);
+    test_create_from_sorted_array(create_choice, CLOSE_WHEN_DONE);
+    test_create_from_sorted_array_size(create_choice, CLOSE_WHEN_DONE);
+
+    // Fetch.
+    init_values(rand_choice);
+    test_create_fetch_verify(create_choice, CLOSE_WHEN_DONE);
+
+    // Iterate.
+    init_values(rand_choice);
+    test_create_iterate_verify(create_choice, CLOSE_WHEN_DONE);
+
+    // set_at.
+    init_values(rand_choice);
+    test_create_set_at(create_choice, CLOSE_WHEN_DONE);
+
+    // delete_at.
+    init_values(rand_choice);
+    test_create_delete_at(create_choice, CLOSE_WHEN_DONE);
+
+    // Insert (does not take a creation strategy).
+    init_values(rand_choice);
+    test_create_insert(CLOSE_WHEN_DONE);
+
+    // Split and merge.
+    init_values(rand_choice);
+    test_split_merge(create_choice, CLOSE_WHEN_DONE);
+}
+
+// Thresholds consumed by test_heaviside(): numbers below first_zero map to
+// -1, numbers in [first_zero, first_pos) map to 0, and the rest map to +1.
+typedef struct {
+    uint32_t first_zero;  // smallest number for which the function returns 0
+    uint32_t first_pos;   // smallest number for which the function returns +1
+} h_extra;
+
+
+// Step function used as the search callback in test_find(): classifies the
+// element's ->number against the thresholds in the h_extra passed through x.
+// Returns -1 below first_zero, 0 below first_pos, and +1 otherwise.
+static int
+test_heaviside (OMTVALUE v_omt, void* x) {
+    assert(v_omt && x);
+    h_extra* bounds = (h_extra*)x;
+    assert(bounds->first_zero <= bounds->first_pos);
+
+    const uint32_t number = V(v_omt)->number;
+    if (number < bounds->first_zero) {
+        return -1;
+    }
+    return (number < bounds->first_pos) ? 0 : 1;
+}
+
+// Fills in both thresholds of an h_extra in one call.
+static void
+heavy_extra (h_extra* extra, uint32_t first_zero, uint32_t first_pos) {
+    extra->first_pos  = first_pos;
+    extra->first_zero = first_zero;
+}
+
+// Runs one search on global_omt and checks the outcome three times, each
+// with a different combination of NULL output pointers.
+//   dir             0 selects omt_find_zero(); any other value is forwarded
+//                   to omt_find() as its direction argument
+//   extra, h        search callback and the state passed to it
+//   r_expect        return code every variant must produce
+//   idx_will_change whether the call is expected to write the index
+//   idx_expect      index expected when idx_will_change is true
+//   number_expect   ->number expected in the value when one is found
+//   cursor_valid    unused
+static void
+test_find_dir (int dir, void* extra, int (*h)(OMTVALUE, void*),
+               int r_expect, bool idx_will_change, uint32_t idx_expect,
+               uint32_t number_expect, bool UU(cursor_valid)) {
+    const uint32_t untouched = UINT32_MAX;
+    uint32_t idx;
+    OMTVALUE found;
+    int r;
+
+    /* Variant 1: NULL value pointer, real index pointer. */
+    found = NULL;
+    idx   = untouched;
+    r = (dir == 0) ? omt_find_zero(global_omt, h, extra,      NULL, &idx)
+                   : omt_find(     global_omt, h, extra, dir, NULL, &idx);
+    CKERR2(r, r_expect);
+    assert(idx == (idx_will_change ? idx_expect : untouched));
+    assert(found == NULL);
+
+    /* Variant 2: real value pointer, NULL index pointer. */
+    found = NULL;
+    idx   = untouched;
+    r = (dir == 0) ? omt_find_zero(global_omt, h, extra,      &found, 0)
+                   : omt_find(     global_omt, h, extra, dir, &found, 0);
+    CKERR2(r, r_expect);
+    assert(idx == untouched);
+    if (r == DB_NOTFOUND) {
+        assert(found == NULL);
+    }
+    else {
+        assert(V(found)->number == number_expect);
+    }
+
+    /* Variant 3: both output pointers NULL. */
+    found = NULL;
+    idx   = untouched;
+    r = (dir == 0) ? omt_find_zero(global_omt, h, extra,      NULL, 0)
+                   : omt_find(     global_omt, h, extra, dir, NULL, 0);
+    CKERR2(r, r_expect);
+    assert(idx == untouched);
+    assert(found == NULL);
+}
+
+// Exercises omt_find()/omt_find_zero() through test_find_dir() for every
+// arrangement of the three regions that test_heaviside() can produce:
+// '-' (returns -1), '0' (returns 0) and '+' (returns +1).  In every call the
+// expected index and expected number are the same, which relies on
+// init_identity_values() storing number == index (presumably -- confirm
+// against its definition).
+static void
+test_find (enum create_type create_choice, enum close_when_done do_close) {
+    h_extra extra;
+    init_identity_values(random_seed, 100);
+    test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
+
+    const uint32_t n          = global_length;
+    const uint32_t half       = n/2;
+    const uint32_t third      = n/3;
+    const uint32_t two_thirds = 2*n/3;
+
+    /* All '-': only the backward search succeeds, on the last element. */
+    heavy_extra(&extra, n, n);
+    test_find_dir(-1, &extra, test_heaviside, 0,           true,  n-1, n-1, true);
+    test_find_dir(+1, &extra, test_heaviside, DB_NOTFOUND, false, 0,   0,   false);
+    test_find_dir(0,  &extra, test_heaviside, DB_NOTFOUND, true,  n,   n,   false);
+
+    /* All '+': only the forward search succeeds, on the first element. */
+    heavy_extra(&extra, 0, 0);
+    test_find_dir(-1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
+    test_find_dir(+1, &extra, test_heaviside, 0,           true,  0, 0, true);
+    test_find_dir(0,  &extra, test_heaviside, DB_NOTFOUND, true,  0, 0, false);
+
+    /* All '0': only the zero search succeeds, on the first element. */
+    heavy_extra(&extra, 0, n);
+    test_find_dir(-1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
+    test_find_dir(+1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
+    test_find_dir(0,  &extra, test_heaviside, 0,           true,  0, 0, true);
+
+    /* '-' then '0', boundary at half. */
+    heavy_extra(&extra, half, n);
+    test_find_dir(-1, &extra, test_heaviside, 0,           true,  half-1, half-1, true);
+    test_find_dir(+1, &extra, test_heaviside, DB_NOTFOUND, false, 0,      0,      false);
+    test_find_dir(0,  &extra, test_heaviside, 0,           true,  half,   half,   true);
+
+    /* '0' then '+', boundary at half. */
+    heavy_extra(&extra, 0, half);
+    test_find_dir(-1, &extra, test_heaviside, DB_NOTFOUND, false, 0,    0,    false);
+    test_find_dir(+1, &extra, test_heaviside, 0,           true,  half, half, true);
+    test_find_dir(0,  &extra, test_heaviside, 0,           true,  0,    0,    true);
+
+    /* '-' then '+', boundary at half: the zero search finds nothing but
+       still reports the boundary index. */
+    heavy_extra(&extra, half, half);
+    test_find_dir(-1, &extra, test_heaviside, 0,           true, half-1, half-1, true);
+    test_find_dir(+1, &extra, test_heaviside, 0,           true, half,   half,   true);
+    test_find_dir(0,  &extra, test_heaviside, DB_NOTFOUND, true, half,   half,   false);
+
+    /* '-', '0', '+' in thirds: all three searches succeed. */
+    heavy_extra(&extra, third, two_thirds);
+    test_find_dir(-1, &extra, test_heaviside, 0, true, third-1,    third-1,    true);
+    test_find_dir(+1, &extra, test_heaviside, 0, true, two_thirds, two_thirds, true);
+    test_find_dir(0,  &extra, test_heaviside, 0, true, third,      third,      true);
+
+    /* Cleanup */
+    test_close(do_close);
+}
+
+// Runs the array tests once per value generator (sorted, random, identity)
+// for the given creation strategy, followed by the find tests.
+static void
+runtests_create_choice (enum create_type create_choice) {
+    const enum rand_type generators[] = { TEST_SORTED, TEST_RANDOM, TEST_IDENTITY };
+    for (size_t k = 0; k < sizeof(generators)/sizeof(generators[0]); k++) {
+        test_create_array(create_choice, generators[k]);
+    }
+    test_find(create_choice, CLOSE_WHEN_DONE);
+}
+
+static void
+test_clone(uint32_t nelts)
+// Test that each clone operation gives the right data back.  If nelts is
+// zero, also tests that you still get a valid omt back and that the way
+// to deallocate it still works.
+//
+// The source omt is filled with the integers 0..nelts-1 stored directly in
+// the OMTVALUE slots, so element i of the clone must read back as i.
+{
+    toku::omt<void *> *src = NULL, *dest = NULL;
+    int r;
+
+    XMALLOC(src);
+    src->create();
+    for (long i = 0; i < nelts; ++i) {
+        r = src->insert_at((OMTVALUE) i, i);
+        assert_zero(r);
+    }
+
+    XMALLOC(dest);
+    // Check the allocation before dest is dereferenced by clone(); the check
+    // previously ran only after the first use.
+    assert(dest != NULL);
+    dest->clone(*src);
+    assert(dest->size() == nelts);
+    for (long i = 0; i < nelts; ++i) {
+        OMTVALUE v;
+        long l;
+        r = dest->fetch(i, &v);
+        assert_zero(r);
+        l = (long) v;
+        assert(l == i);
+    }
+    // Both omts were allocated here, so both are destroyed and freed here.
+    dest->destroy();
+    toku_free(dest);
+    src->destroy();
+    toku_free(src);
+}
+
+// Test entry point: runs the creation tests, then the full suite once per
+// creation strategy, then the clone tests at several sizes.
+int
+test_main(int argc, const char *argv[]) {
+    const enum create_type strategies[] = {
+        BATCH_INSERT, STEAL_ARRAY, INSERT_AT, INSERT_AT_ALMOST_RANDOM
+    };
+    const uint32_t clone_sizes[] = { 0, 1, 1000, 10000 };
+
+    parse_args(argc, argv);
+    init_globals();
+
+    test_create(CLOSE_WHEN_DONE);
+    test_create_size(CLOSE_WHEN_DONE);
+    for (size_t k = 0; k < sizeof(strategies)/sizeof(strategies[0]); k++) {
+        runtests_create_choice(strategies[k]);
+    }
+    for (size_t k = 0; k < sizeof(clone_sizes)/sizeof(clone_sizes[0]); k++) {
+        test_clone(clone_sizes[k]);
+    }
+
+    cleanup_globals();
+    return 0;
+}
+
diff --git a/storage/tokudb/PerconaFT/util/tests/omt-tmpl-test.cc b/storage/tokudb/PerconaFT/util/tests/omt-tmpl-test.cc
new file mode 100644
index 00000000..8cfc875c
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/omt-tmpl-test.cc
@@ -0,0 +1,162 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <type_traits>
+#include <memory.h>
+#include <toku_portability.h>
+#include <toku_assert.h>
+#include <toku_time.h>
+#include <util/omt.h>
+
+namespace toku {
+
+namespace test {
+
+    // Three-way comparison of two ints (-1, 0 or +1); used as the comparison
+    // template argument of omt<int>::insert below.
+    inline int intcmp(const int &a, const int &b);
+    inline int intcmp(const int &a, const int &b) {
+        if (a == b) {
+            return 0;
+        }
+        return (a < b) ? -1 : +1;
+    }
+
+    typedef omt<int> int_omt_t;
+
+    // Sentinel handed to iterate() as the extra pointer; intiter checks that
+    // the pointer it receives still refers to this value.
+    static int intiter_magic = 0xdeadbeef;
+    inline int intiter(const int &value __attribute__((__unused__)), const uint32_t idx __attribute__((__unused__)), int *const extra);
+    inline int intiter(const int &value __attribute__((__unused__)), const uint32_t idx __attribute__((__unused__)), int *const extra) {
+        invariant(intiter_magic == *extra);
+        return 0;
+    }
+
+    // Accumulator for intiter2: how many elements were visited and the value
+    // of the most recent one.
+    struct intiter2extra {
+        int count;
+        int last;
+    };
+    // Iteration callback that counts elements and checks that the values are
+    // visited in strictly increasing order.
+    inline int intiter2(const int &value, const uint32_t idx __attribute__((__unused__)), struct intiter2extra *const extra);
+    inline int intiter2(const int &value, const uint32_t idx __attribute__((__unused__)), struct intiter2extra *const extra) {
+        invariant(extra->last < value);
+        extra->count++;
+        extra->last = value;
+        return 0;
+    }
+
+    // Smoke test of the omt<int> template: insert, fetch, iterate,
+    // iterate_on_range, set_at, delete_at, create_steal_sorted_array and merge.
+    static void unittest(void) {
+        int_omt_t ints;
+        int r;
+        ints.create();
+        invariant(ints.size() == 0);
+
+        // Insert 1 and 3 first, then 2, so the last insert lands in the middle.
+        r = ints.insert<int, intcmp>(1, 1, nullptr);
+        invariant_zero(r);
+        r = ints.insert<int, intcmp>(3, 3, nullptr);
+        invariant_zero(r);
+        invariant(ints.size() == 2);
+
+        r = ints.insert<int, intcmp>(2, 2, nullptr);
+        invariant_zero(r);
+        invariant(ints.size() == 3);
+
+        int middle;
+        r = ints.fetch(1, &middle);
+        invariant_zero(r);
+        invariant(middle == 2);
+
+        r = ints.iterate<int, intiter>(&intiter_magic);
+        invariant_zero(r);
+
+        // Range [0, 2) covers the first two elements: 1 and 2.
+        struct intiter2extra range_stats = {0, 0};
+        r = ints.iterate_on_range<struct intiter2extra, intiter2>(0, 2, &range_stats);
+        invariant_zero(r);
+        invariant(range_stats.count == 2);
+        invariant(range_stats.last == 2);
+
+        r = ints.set_at(5, 1);
+        invariant_zero(r);
+        r = ints.delete_at(1);
+        invariant_zero(r);
+        invariant(ints.size() == 2);
+
+        ints.destroy();
+
+        // Left half: 1..4, built from an array the omt takes ownership of
+        // (the caller's pointer is nulled out).
+        int *intarray = nullptr;
+        XMALLOC_N(4, intarray);
+        for (int k = 0; k < 4; ++k) {
+            intarray[k] = k + 1;
+        }
+        int_omt_t left, right;
+        left.create_steal_sorted_array(&intarray, 4, 4);
+        invariant_null(intarray);
+
+        // Right half: 5..8, inserted in descending order.
+        right.create();
+        for (int v = 8; v >= 5; --v) {
+            r = right.insert<int, intcmp>(v, v, nullptr);
+            invariant_zero(r);
+        }
+
+        // Merging leaves both inputs empty and yields 1..8 in order.
+        int_omt_t combined;
+        combined.merge(&left, &right);
+        invariant(combined.size() == 8);
+        invariant(left.size() == 0);
+        invariant(right.size() == 0);
+        struct intiter2extra merged_stats = {0, 0};
+        r = combined.iterate<struct intiter2extra, intiter2>(&merged_stats);
+        invariant_zero(r);
+        invariant(merged_stats.count == 8);
+        invariant(merged_stats.last == 8);
+
+        combined.destroy();
+    }
+
+} // end namespace test
+
+} // end namespace toku
+
+int main(void) {
+    toku::test::unittest();
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/queue-test.cc b/storage/tokudb/PerconaFT/util/tests/queue-test.cc
new file mode 100644
index 00000000..f87e05bc
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/queue-test.cc
@@ -0,0 +1,136 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <toku_portability.h>
+#include "toku_os.h"
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <toku_assert.h>
+#include <toku_pthread.h>
+#include "util/queue.h"
+
+// Verbosity level; raised by -v and lowered by -q in parse_args, never below 0.
+static int verbose=1;
+
+// Number of items the dequeue thread (start_0) has received in the current test.
+static int count_0 = 0;
+static uint64_t e_max_weight=0, d_max_weight = 0; // max weight seen by enqueue thread and dequeue thread respectively.
+
+// Consumer thread body: dequeues from the QUEUE passed in arg until
+// toku_queue_deq returns EOF, asserting that the items arrive as 0, 1, 2, ...
+// in order.  Increments count_0 once per item and records in d_max_weight the
+// largest value reported through the call's last out-parameter.
+static void *start_0 (void *arg) {
+    QUEUE q = (QUEUE)arg;
+    long expected = 0;
+    for (;;) {
+        void *item;
+        uint64_t weight;
+        uint64_t this_max_weight;
+        int r = toku_queue_deq(q, &item, &weight, &this_max_weight);
+        if (r == EOF) {
+            break;
+        }
+        assert(r == 0);
+        if (d_max_weight < this_max_weight) {
+            d_max_weight = this_max_weight;
+        }
+        assert((long)item == expected);
+        count_0++;
+        expected++;
+    }
+    return NULL;
+}
+
+// Enqueues the integer v (carried in the item pointer) with an item weight of
+// 0 when weight is 0 and 1 otherwise, and records in e_max_weight the largest
+// value reported through the call's last out-parameter.
+static void enq (QUEUE q, long v, uint64_t weight) {
+    const int item_weight = (weight != 0) ? 1 : 0;
+    uint64_t this_max_weight;
+    int r = toku_queue_enq(q, (void*)v, item_weight, &this_max_weight);
+    assert(r == 0);
+    if (e_max_weight < this_max_weight) {
+        e_max_weight = this_max_weight;
+    }
+}
+
+static void queue_test_0 (uint64_t weight)
+// Test a queue that can hold WEIGHT items.
+// Starts a consumer thread (start_0), enqueues the items 0..5 with a one
+// second pause after the fourth, signals EOF, joins the consumer and checks
+// that all six items were received and that neither side ever observed a
+// weight above WEIGHT.
+{
+    // Reset the per-test counters shared with start_0 and enq.
+    count_0 = 0;
+    e_max_weight = 0;
+    d_max_weight = 0;
+    QUEUE q;
+    int r;
+    r = toku_queue_create(&q, weight);
+    assert(r == 0);
+    toku_pthread_t thread;
+    r = toku_pthread_create(toku_uninstrumented, &thread, nullptr, start_0, q);
+    assert(r == 0);
+    enq(q, 0L, weight);
+    enq(q, 1L, weight);
+    enq(q, 2L, weight);
+    enq(q, 3L, weight);
+    sleep(1);
+    enq(q, 4L, weight);
+    enq(q, 5L, weight);
+    r = toku_queue_eof(q);
+    assert(r == 0);
+    void *result;
+    r = toku_pthread_join(thread, &result);
+    assert(r == 0);
+    assert(result==NULL);
+    assert(count_0==6);
+    r = toku_queue_destroy(q);
+    // The destroy result used to be stored and never looked at.
+    assert(r == 0);
+    assert(d_max_weight <= weight);
+    assert(e_max_weight <= weight);
+}
+
+
+// Parses the command line: each -v raises verbose, each -q lowers it, and any
+// other argument prints a usage message and exits with status 1.  verbose is
+// clamped to a minimum of 0 once all arguments have been processed.
+static void parse_args (int argc, const char *argv[]) {
+    const char *progname = argv[0];
+    for (int k = 1; k < argc; k++) {
+        const char *arg = argv[k];
+        if (strcmp(arg, "-v") == 0) {
+            verbose++;
+        } else if (strcmp(arg, "-q") == 0) {
+            verbose--;
+        } else {
+            fprintf(stderr, "Usage:\n %s [-v] [-q]\n", progname);
+            exit(1);
+        }
+    }
+    if (verbose < 0) {
+        verbose = 0;
+    }
+}
+
+// Runs the queue test with weight limits 0, 1 and 2.
+int main (int argc, const char *argv[]) {
+    parse_args(argc, argv);
+    for (uint64_t weight = 0; weight <= 2; weight++) {
+        queue_test_0(weight);
+    }
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/rwlock_condvar.h b/storage/tokudb/PerconaFT/util/tests/rwlock_condvar.h
new file mode 100644
index 00000000..b49c2780
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/rwlock_condvar.h
@@ -0,0 +1,149 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+/* Fair readers writer lock implemented using condition variables.
+ * This is maintained so that we can measure the performance of a relatively simple implementation (this one) 
+ * compared to a fast one that uses compare-and-swap (the one in ../toku_rwlock.c)
+ * For now it's only for testing.
+ */
+
+
+// Fair readers/writer locks.  These are fair (meaning first-come first-served.  No reader starvation, and no writer starvation).  And they are
+// probably faster than the linux readers/writer locks (pthread_rwlock_t).
+// Waiting threads are kept in a singly linked FIFO (waiters_head .. waiters_tail);
+// all fields are read and written with `mutex` held.
+struct toku_cv_fair_rwlock_waiter_state; // this structure is used internally.
+typedef struct toku_cv_fair_rwlock_s {
+    toku_mutex_t                          mutex; // protects state and the waiter list
+    int                                   state; // 0 means no locks, + is number of readers locked, -1 is a writer
+    struct toku_cv_fair_rwlock_waiter_state *waiters_head, *waiters_tail; // oldest and newest waiter; both NULL when nobody waits
+} toku_cv_fair_rwlock_t;
+
+// Lifecycle: init before first use, destroy when no thread uses the lock any more.
+void toku_cv_fair_rwlock_init (toku_cv_fair_rwlock_t *rwlock);
+void toku_cv_fair_rwlock_destroy (toku_cv_fair_rwlock_t *rwlock);
+// Acquire for reading / acquire for writing / release.  The implementations
+// in this header always return 0.
+int toku_cv_fair_rwlock_rdlock (toku_cv_fair_rwlock_t *rwlock);
+int toku_cv_fair_rwlock_wrlock (toku_cv_fair_rwlock_t *rwlock);
+int toku_cv_fair_rwlock_unlock (toku_cv_fair_rwlock_t *rwlock);
+
+// One entry in a lock's FIFO of waiting threads.
+struct toku_cv_fair_rwlock_waiter_state {
+    char is_read;                                  // 1 if the waiter wants a read lock, 0 for a write lock
+    struct toku_cv_fair_rwlock_waiter_state *next; // next (younger) waiter, or NULL at the tail
+    toku_cond_t cond;                              // signalled when this waiter should re-check the lock
+};
+
+// Each thread owns exactly one waiter record, which it links into a lock's
+// waiter list while it is blocked in rdlock/wrlock.
+static __thread struct toku_cv_fair_rwlock_waiter_state waitstate = {0, NULL, {PTHREAD_COND_INITIALIZER} };
+
+// Puts the lock in its initial state: mutex initialized, nobody holding the
+// lock, and an empty waiter list.
+void toku_cv_fair_rwlock_init (toku_cv_fair_rwlock_t *rwlock) {
+    toku_mutex_init(toku_uninstrumented, &rwlock->mutex, nullptr);
+    rwlock->waiters_tail = NULL;
+    rwlock->waiters_head = NULL;
+    rwlock->state = 0;
+}
+
+// Releases the lock's mutex.  The waiter list is not touched here; the waiter
+// records are per-thread statics, not owned by the lock.
+void toku_cv_fair_rwlock_destroy(toku_cv_fair_rwlock_t *rwlock) {
+    toku_mutex_destroy(&rwlock->mutex);
+}
+
+// Acquires the lock for reading.  If nobody is queued and no writer holds the
+// lock, the reader count is bumped immediately.  Otherwise the calling thread
+// appends its per-thread waiter record to the FIFO and sleeps until it is at
+// the head of the queue and no writer holds the lock.  Always returns 0.
+int toku_cv_fair_rwlock_rdlock (toku_cv_fair_rwlock_t *rwlock) {
+    toku_mutex_lock(&rwlock->mutex);
+    if (rwlock->waiters_head!=NULL || rwlock->state<0) {
+	// Someone is ahead of me in the queue, or someone has a lock.
+	// We use per-thread-state for the condition variable.  A thread cannot get control and try to reuse the waiter state for something else.
+	if (rwlock->waiters_tail) {
+	    rwlock->waiters_tail->next = &waitstate;
+	} else {
+	    rwlock->waiters_head = &waitstate;
+	}
+	rwlock->waiters_tail = &waitstate;
+	waitstate.next = NULL;
+	waitstate.is_read = 1; 
+	// Re-check after every wakeup: we may proceed only when we are the
+	// oldest waiter and no writer holds the lock.
+	do {
+	    toku_cond_wait(&waitstate.cond, &rwlock->mutex);
+	} while (rwlock->waiters_head!=&waitstate || rwlock->state<0);
+	rwlock->state++;
+	// Unlink ourselves from the head of the queue.
+	rwlock->waiters_head=waitstate.next;
+	if (waitstate.next==NULL) rwlock->waiters_tail=NULL;
+	// If the next waiter is also a reader, wake it so that consecutive
+	// readers can share the lock.
+	if (rwlock->waiters_head && rwlock->waiters_head->is_read) {
+	    toku_cond_signal(&rwlock->waiters_head->cond);
+	}
+    } else {
+	// No one is waiting, and any holders are readers.
+	rwlock->state++;
+    }
+    toku_mutex_unlock(&rwlock->mutex);
+    return 0;
+}
+
+// Acquires the lock for writing.  If nobody is queued and the lock is free,
+// it is taken immediately.  Otherwise the calling thread appends its
+// per-thread waiter record to the FIFO and sleeps until it is at the head of
+// the queue and the lock is completely free.  Always returns 0.
+int toku_cv_fair_rwlock_wrlock (toku_cv_fair_rwlock_t *rwlock) {
+    toku_mutex_lock(&rwlock->mutex);
+    if (rwlock->waiters_head!=NULL || rwlock->state!=0) {
+	// Someone else is ahead of me, or someone holds the lock, so we must wait our turn.
+	if (rwlock->waiters_tail) {
+	    rwlock->waiters_tail->next = &waitstate;
+	} else {
+	    rwlock->waiters_head = &waitstate;
+	}
+	rwlock->waiters_tail = &waitstate;
+	waitstate.next = NULL;
+	waitstate.is_read = 0;
+	// Re-check after every wakeup: we may proceed only when we are the
+	// oldest waiter and there are no readers and no writer.
+	do {
+	    toku_cond_wait(&waitstate.cond, &rwlock->mutex);
+	} while (rwlock->waiters_head!=&waitstate || rwlock->state!=0);
+	// Unlink ourselves from the head of the queue.
+	rwlock->waiters_head = waitstate.next;
+	if (waitstate.next==NULL) rwlock->waiters_tail=NULL;
+    }
+    // Mark the lock as held by a writer.
+    rwlock->state = -1;
+    toku_mutex_unlock(&rwlock->mutex);
+    return 0;
+}
+
+// Releases one hold on the lock (a reader's share or the writer's exclusive
+// hold).  When that leaves the lock free and a thread is queued, the waiter at
+// the head of the queue is signalled.  Always returns 0.
+int toku_cv_fair_rwlock_unlock (toku_cv_fair_rwlock_t *rwlock) {
+    toku_mutex_lock(&rwlock->mutex);
+    assert(rwlock->state!=0);
+    // A reader drops one share; a writer (negative state) releases outright.
+    rwlock->state = (rwlock->state > 0) ? rwlock->state - 1 : 0;
+    // Only the release that leaves the lock free wakes the oldest waiter.
+    if (rwlock->state == 0 && rwlock->waiters_head != NULL) {
+        toku_cond_signal(&rwlock->waiters_head->cond);
+    }
+    toku_mutex_unlock(&rwlock->mutex);
+    return 0;
+}
+
diff --git a/storage/tokudb/PerconaFT/util/tests/sm-basic.cc b/storage/tokudb/PerconaFT/util/tests/sm-basic.cc
new file mode 100644
index 00000000..0e5eb836
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/sm-basic.cc
@@ -0,0 +1,77 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+// test that basic scoped malloc works with a thread
+
+#include <toku_portability.h>
+#include <toku_assert.h>
+#include <toku_pthread.h>
+#include <util/scoped_malloc.h>
+
+// Creates three scoped_malloc objects in nested scopes (constructed with
+// arguments 1, 2 and 3), so that they go out of scope innermost first:
+// c, then b, then a.
+static void sm_test(void) {
+    toku::scoped_malloc a(1);
+    {
+        toku::scoped_malloc b(2);
+        {
+            toku::scoped_malloc c(3);
+        }
+    }
+}
+
+// Thread entry point: runs sm_test() and hands its argument back unchanged as
+// the thread's result.
+static void *sm_test_f(void *arg) {
+    sm_test();
+    return arg;
+}
+
+// Runs sm_test on a separate thread, bracketed by the scoped-malloc
+// init and destroy calls.
+int main(void) {
+    toku_scoped_malloc_init();
+
+    // Spawn a single worker that runs the test, then wait for it to finish.
+    toku_pthread_t worker;
+    int r = toku_pthread_create(
+        toku_uninstrumented, &worker, nullptr, sm_test_f, nullptr);
+    assert_zero(r);
+    void *worker_result;
+    r = toku_pthread_join(worker, &worker_result);
+    assert_zero(r);
+
+    toku_scoped_malloc_destroy();
+
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/sm-crash-double-free.cc b/storage/tokudb/PerconaFT/util/tests/sm-crash-double-free.cc
new file mode 100644
index 00000000..5aa35655
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/sm-crash-double-free.cc
@@ -0,0 +1,79 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+// force a race between the scoped malloc global destructor and a thread variable destructor
+
+#define TOKU_SCOPED_MALLOC_DEBUG 1
+#include <toku_portability.h>
+#include <toku_assert.h>
+#include <toku_pthread.h>
+#include <toku_race_tools.h>
+#include <util/scoped_malloc.h>
+
+volatile int state = 0;  // handshake flag: 0 = initial, 1 = worker has run sm_test(), 2 = main lets the worker return
+
+static void sm_test(void) {  // construct one toku::scoped_malloc on this thread; it is destroyed on return
+    toku::scoped_malloc a(1);
+}
+
+static void *sm_test_f(void *arg) {  // thread entry point: use scoped malloc, then wait for main's signal
+    sm_test();
+    state = 1;  // tell main that this thread has run sm_test()
+    while (state != 2) sleep(1);  // poll once per second until main sets state to 2
+    return arg;
+}
+
+int main(void) {  // sequence the teardown so the worker thread exits between destroy_set() and destroy_key()
+    TOKU_VALGRIND_HG_DISABLE_CHECKING(&state, sizeof state);  // presumably silences race-checker reports for the unlocked flag
+    state = 0;
+    toku_scoped_malloc_init();
+    toku_pthread_t tid;
+    int r;
+    r = toku_pthread_create(
+        toku_uninstrumented, &tid, nullptr, sm_test_f, nullptr);
+    assert_zero(r);
+    void *ret;  // receives sm_test_f's return value; not inspected
+    while (state != 1)  // wait until the worker has run sm_test()
+        sleep(1);
+    toku_scoped_malloc_destroy_set();  // called while the worker thread is still alive
+    state = 2;  // release the worker so it can return
+    r = toku_pthread_join(tid, &ret);
+    assert_zero(r);
+    toku_scoped_malloc_destroy_key();  // called only after the worker thread has been joined
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/sort-tmpl-test.cc b/storage/tokudb/PerconaFT/util/tests/sort-tmpl-test.cc
new file mode 100644
index 00000000..4db3b93d
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/sort-tmpl-test.cc
@@ -0,0 +1,179 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "test.h"
+
+#include <stdlib.h>
+
+#include <memory.h>
+#include <util/sort.h>
+
+const int MAX_NUM = 0x0fffffffL;  // exclusive upper bound asserted on every int value the tests sort
+int MAGIC_EXTRA = 0xd3adb00f;  // sentinel passed as the sort's "extra" argument; the comparators assert they receive it
+
+static int  // qsort(3)-style comparator for ints: returns -1, 0 or 1
+int_qsort_cmp(const void *va, const void *vb) {
+    const int *CAST_FROM_VOIDP(a, va);
+    const int *CAST_FROM_VOIDP(b, vb);
+    assert(*a < MAX_NUM);  // every value under test must be below MAX_NUM
+    assert(*b < MAX_NUM);
+    return (*a > *b) - (*a < *b);  // three-way result without the overflow risk of *a - *b
+}
+
+int int_cmp(const int &e, const int &a, const int &b);  // prototype for the non-static definition that follows
+int  // comparator given to toku::sort as a template argument; e is the caller-supplied extra
+int_cmp(const int &e, const int &a, const int &b)
+{
+    assert(e == MAGIC_EXTRA);  // the sort must hand the extra through unchanged
+    return int_qsort_cmp(&a, &b);
+}
+
+// Assert that a[0..nelts) is sorted in non-decreasing order and that every
+// element is below MAX_NUM.  An empty array (nelts <= 0) passes trivially.
+static void check_int_array(int a[], int nelts) {
+    for (int i = 0; i < nelts; ++i) {
+        assert(a[i] < MAX_NUM);
+        // only compare against a predecessor when there is one
+        assert(i == 0 || a[i-1] <= a[i]);
+    }
+}
+
+static void  // sort a NULL array of zero elements; passes if the call returns without asserting
+zero_array_test(void)
+{
+    int unused = MAGIC_EXTRA - 1;  // deliberately not MAGIC_EXTRA, so int_cmp would assert if it were ever invoked
+    toku::sort<int, const int, int_cmp>::mergesort_r(NULL, 0, unused);
+}
+
+// Sort nelts identical values and verify the output is still ordered.
+static void dup_array_test(int nelts) {
+    int *XMALLOC_N(nelts, values);
+    int idx = 0;
+    while (idx < nelts) {
+        values[idx++] = 1;
+    }
+    toku::sort<int, const int, int_cmp>::mergesort_r(values, nelts, MAGIC_EXTRA);
+    check_int_array(values, nelts);
+    toku_free(values);
+}
+
+// Sort an array that already holds 0, 1, ..., nelts-1 in ascending order
+// and verify the output is ordered.
+static void already_sorted_test(int nelts) {
+    int *XMALLOC_N(nelts, values);
+    for (int k = nelts - 1; k >= 0; --k) {
+        values[k] = k;  // fill order is irrelevant: each slot holds its own index
+    }
+    toku::sort<int, const int, int_cmp>::mergesort_r(values, nelts, MAGIC_EXTRA);
+    check_int_array(values, nelts);
+    toku_free(values);
+}
+
+static void  // sort random ints with mergesort_r and cross-check the result against libc qsort
+random_array_test(int nelts)
+{
+    int *XMALLOC_N(nelts, a);
+    int *XMALLOC_N(nelts, b);  // b starts as an identical copy of a and is sorted independently below
+    for (int i = 0; i < nelts; ++i) {
+        a[i] = rand() % MAX_NUM;  // keeps values in [0, MAX_NUM) so the comparator's asserts hold
+        b[i] = a[i];
+    }
+    toku::sort<int, const int, int_cmp>::mergesort_r(a, nelts, MAGIC_EXTRA);
+    check_int_array(a, nelts);
+    qsort(b, nelts, sizeof b[0], int_qsort_cmp);  // reference ordering
+    for (int i = 0; i < nelts; ++i) {
+        assert(a[i] == b[i]);  // both sorts must agree element by element
+    }
+    toku_free(a);
+    toku_free(b);
+}
+
+// qsort(3)-compatible three-way comparison of two uint64_t values.
+static int uint64_qsort_cmp(const void *va, const void *vb) {
+    const uint64_t lhs = *static_cast<const uint64_t *>(va);
+    const uint64_t rhs = *static_cast<const uint64_t *>(vb);
+    return (lhs < rhs) ? -1 : ((lhs > rhs) ? 1 : 0);
+}
+
+int uint64_cmp(const int &e, const uint64_t &a, const uint64_t &b);  // prototype for the non-static definition that follows
+int  // comparator given to toku::sort for uint64_t elements; e is the caller-supplied extra
+uint64_cmp(const int &e, const uint64_t &a, const uint64_t &b)
+{
+    assert(e == MAGIC_EXTRA);  // the sort must hand the extra through unchanged
+    return uint64_qsort_cmp(&a, &b);
+}
+
+static void  // sort random uint64_t values with mergesort_r and cross-check the result against libc qsort
+random_array_test_64(int nelts)
+{
+    uint64_t *XMALLOC_N(nelts, a);
+    uint64_t *XMALLOC_N(nelts, b);  // b starts as an identical copy of a and is sorted independently below
+    for (int i = 0; i < nelts; ++i) {
+        a[i] = ((uint64_t)rand() << 32ULL) | rand();  // one rand() result shifted into the upper half, another OR-ed into the lower bits
+        b[i] = a[i];
+    }
+    toku::sort<uint64_t, const int, uint64_cmp>::mergesort_r(a, nelts, MAGIC_EXTRA);
+    qsort(b, nelts, sizeof b[0], uint64_qsort_cmp);  // reference ordering
+    for (int i = 0; i < nelts; ++i) {
+        assert(a[i] == b[i]);  // both sorts must agree element by element
+    }
+    toku_free(a);
+    toku_free(b);
+}
+
+// Test driver: runs every sort scenario in a fixed order, from the empty
+// array up to ten-million-element inputs.  Always returns 0.
+int test_main(int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
+    static const int random_sizes[] = { 10, 1000, 10001, 19999, 39999, 10000000 };
+    static const int common_sizes[] = { 10, 1000, 10001, 10000000 };
+    const int n_random = sizeof random_sizes / sizeof random_sizes[0];
+    const int n_common = sizeof common_sizes / sizeof common_sizes[0];
+    zero_array_test();
+    for (int i = 0; i < n_random; ++i) {
+        random_array_test(random_sizes[i]);
+    }
+    random_array_test_64(10000000);
+    for (int i = 0; i < n_common; ++i) {
+        dup_array_test(common_sizes[i]);
+    }
+    for (int i = 0; i < n_common; ++i) {
+        already_sorted_test(common_sizes[i]);
+    }
+    return 0;
+}
+// end of sort-tmpl-test.cc
diff --git a/storage/tokudb/PerconaFT/util/tests/test-frwlock-fair-writers.cc b/storage/tokudb/PerconaFT/util/tests/test-frwlock-fair-writers.cc
new file mode 100644
index 00000000..9a625c32
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test-frwlock-fair-writers.cc
@@ -0,0 +1,90 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+// check if write locks are fair
+
+#include <stdio.h>
+#include <toku_assert.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <util/frwlock.h>
+
+toku_mutex_t rwlock_mutex;  // held around every call on rwlock below; also passed to rwlock.init()
+toku::frwlock rwlock;  // the lock under test
+volatile int killed = 0;  // set to 1 by main to make the writer threads stop looping
+
+static void *t1_func(void *arg) {  // writer thread: repeatedly take the write lock, hold it for 10 ms, release it
+    int i;  // iterations completed; printed when the thread stops
+    for (i = 0; !killed; i++) {
+        toku_mutex_lock(&rwlock_mutex);
+        rwlock.write_lock(false);  // false: presumably the "expensive" flag -- TODO confirm against util/frwlock.h
+        toku_mutex_unlock(&rwlock_mutex);
+        usleep(10000);  // keep the write lock for 10 ms with the mutex released
+        toku_mutex_lock(&rwlock_mutex);
+        rwlock.write_unlock();
+        toku_mutex_unlock(&rwlock_mutex);
+    }
+    printf("%lu %d\n", (unsigned long) pthread_self(), i);  // thread id and iteration count
+    return arg;
+}
+
+int main(void) {  // run two competing writer threads for 10 seconds; fairness is only reported via the printed counts, not asserted
+    int r;
+
+    toku_mutex_init(toku_uninstrumented, &rwlock_mutex, nullptr);
+    rwlock.init(&rwlock_mutex);
+
+    const int nthreads = 2;
+    pthread_t tids[nthreads];
+    for (int i = 0; i < nthreads; i++) {
+        r = pthread_create(&tids[i], NULL, t1_func, NULL); 
+        assert(r == 0);
+    }
+    sleep(10);  // let the writers compete
+    killed = 1;  // ask the writers to stop after their current iteration
+    for (int i = 0; i < nthreads; i++) {
+        void *ret;  // receives t1_func's return value; not inspected
+        r = pthread_join(tids[i], &ret);
+        assert(r == 0);
+    }
+
+    rwlock.deinit();
+    toku_mutex_destroy(&rwlock_mutex);
+
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/test-kibbutz.cc b/storage/tokudb/PerconaFT/util/tests/test-kibbutz.cc
new file mode 100644
index 00000000..5672a853
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test-kibbutz.cc
@@ -0,0 +1,91 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "test.h"
+#include <util/kibbutz.h>
+
+#include <memory.h>
+#include <stdio.h>
+
+#define ND 10  // number of work items enqueued per test run
+#define NT 4  // value passed as the first argument of toku_kibbutz_create (presumably the worker-thread count)
+bool done[ND];  // done[id] is set to true by the work item with that id
+
+static void dowork (void *idv) {  // work item: idv points at this item's index into done[]
+    int *CAST_FROM_VOIDP(idp, idv);
+    int id = *idp;
+    if (verbose) printf("s%d\n", id);  // "s<id>": item started
+    assert(!done[id]);  // an item must not run twice
+    sleep(1);
+    done[id] = true;
+    sleep(1);
+    if (verbose) printf("d%d\n", id);  // "d<id>": item finished
+}
+
+static void kibbutz_test (bool parent_finishes_first) {  // enqueue ND items, destroy the kibbutz, then check every item ran
+    KIBBUTZ k = NULL;
+    int r = toku_kibbutz_create(NT, &k);
+    assert(r == 0);
+    if (verbose) printf("create\n");
+    int ids[ND];  // per-item arguments; the work items read them through the pointer passed to enq
+    for (int i=0; i<ND; i++) {
+	done[i]=false;
+	ids[i] =i;
+    }
+    for (int i=0; i<ND; i++) {
+	if (verbose) printf("e%d\n", i);  // "e<i>": item enqueued
+	toku_kibbutz_enq(k, dowork, &ids[i]);
+    }
+    if (!parent_finishes_first) {
+	sleep((ND+2*NT)/NT);  // 4 seconds with ND=10 and NT=4: give the workers a head start before destroy
+    }
+    toku_kibbutz_destroy(k);
+    for (int i=0; i<ND; i++) assert(done[i]);  // every item must have run by the time destroy returns
+}
+
+int  // test driver: runs kibbutz_test once with the pre-destroy sleep and once without it
+test_main (int argc , const char *argv[]) {
+    default_parse_args(argc, argv);
+    
+    kibbutz_test(false);  // parent sleeps before calling destroy
+    kibbutz_test(true);  // parent calls destroy right after enqueueing
+    if (verbose) printf("test ok\n");
+    return 0;
+}
+
+
diff --git a/storage/tokudb/PerconaFT/util/tests/test-kibbutz2.cc b/storage/tokudb/PerconaFT/util/tests/test-kibbutz2.cc
new file mode 100644
index 00000000..8ccd37c3
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test-kibbutz2.cc
@@ -0,0 +1,89 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "test.h"
+#include <util/kibbutz.h>
+
+#include <memory.h>
+#include <stdio.h>
+
+#define ND 10  // number of work items enqueued
+bool done[ND];  // done[id] is set to true by the work item with that id
+
+static void dowork (void *idv) {  // work item: checks that all lower-numbered items finished before this one started
+    int *CAST_FROM_VOIDP(idp, idv);
+    int id = *idp;
+    if (verbose) printf("s%d\n", id);  // "s<id>": item started
+    for (int i = 0; i < id; i++) {
+        assert(done[i]);  // items enqueued earlier must already be marked done
+    }
+    assert(!done[id]);  // an item must not run twice
+    sleep(1);
+    done[id] = true;
+    sleep(1);
+    if (verbose) printf("d%d\n", id);  // "d<id>": item finished
+}
+
+static void kibbutz_test (void) {  // enqueue ND items in index order, destroy the kibbutz, then check every item ran
+    KIBBUTZ k = NULL;
+    int r = toku_kibbutz_create(1, &k);  // 1: presumably a single worker thread, which dowork's ordering asserts rely on -- TODO confirm
+    assert(r == 0);
+    if (verbose) printf("create\n");
+    int ids[ND];  // per-item arguments; the work items read them through the pointer passed to enq
+    for (int i=0; i<ND; i++) {
+        done[i]=false;
+        ids[i] =i;
+    }
+    for (int i=0; i<ND; i++) {
+        if (verbose) printf("e%d\n", i);  // "e<i>": item enqueued
+        toku_kibbutz_enq(k, dowork, &ids[i]);
+    }
+    toku_kibbutz_destroy(k);
+    for (int i=0; i<ND; i++) assert(done[i]);  // every item must have run by the time destroy returns
+}
+
+int  // test driver: parses the standard test arguments and runs kibbutz_test once
+test_main (int argc , const char *argv[]) {
+    default_parse_args(argc, argv);
+    
+    kibbutz_test();
+    if (verbose) printf("test ok\n");
+    return 0;
+}
+
+
diff --git a/storage/tokudb/PerconaFT/util/tests/test-rwlock-cheapness.cc b/storage/tokudb/PerconaFT/util/tests/test-rwlock-cheapness.cc
new file mode 100644
index 00000000..c0b43c2d
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test-rwlock-cheapness.cc
@@ -0,0 +1,254 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include <toku_portability.h>
+#include <toku_assert.h>
+#include <portability/toku_pthread.h>
+#include <portability/toku_time.h>
+#include <util/frwlock.h>
+#include <util/rwlock.h>
+#include "rwlock_condvar.h"
+
+// We need to manually initialize partitioned counters so that the
+// ones automatically incremented by the frwlock get handled properly.
+#include <util/partitioned_counter.h>
+
+toku_mutex_t mutex;  // held around every call on w below; also passed to w.init()
+toku::frwlock w;  // the lock under test
+
+static void grab_write_lock(bool expensive) {  // take w's write lock with the given expensive flag, holding mutex around the call
+    toku_mutex_lock(&mutex);
+    w.write_lock(expensive);
+    toku_mutex_unlock(&mutex);
+}
+
+static void release_write_lock(void) {  // release w's write lock, holding mutex around the call
+    toku_mutex_lock(&mutex);
+    w.write_unlock();
+    toku_mutex_unlock(&mutex);
+}
+
+static void grab_read_lock(void) {  // take a read lock on w, holding mutex around the call
+    toku_mutex_lock(&mutex);
+    w.read_lock();
+    toku_mutex_unlock(&mutex);
+}
+
+static void release_read_lock(void) {  // release one read lock on w, holding mutex around the call
+    toku_mutex_lock(&mutex);
+    w.read_unlock();
+    toku_mutex_unlock(&mutex);
+}
+
+static void *do_cheap_wait(void *arg) {  // thread body: take a cheap (expensive=false) write lock and release it at once
+    grab_write_lock(false);
+    release_write_lock();
+    return arg;
+}
+
+static void *do_expensive_wait(void *arg) {  // thread body: take an expensive (expensive=true) write lock and release it at once
+    grab_write_lock(true);
+    release_write_lock();
+    return arg;
+}
+
+static void *do_read_wait(void *arg) {  // thread body: take a read lock and release it at once
+    grab_read_lock();
+    release_read_lock();
+    return arg;
+}
+
+static void launch_cheap_waiter(void) {  // start a detached thread running do_cheap_wait
+    toku_pthread_t tid;
+    int r = toku_pthread_create(
+        toku_uninstrumented, &tid, nullptr, do_cheap_wait, nullptr);
+    assert_zero(r);
+    toku_pthread_detach(tid);  // never joined; the thread cleans up after itself
+    sleep(1);  // give the new thread time to reach the lock before the caller continues
+}
+
+static void launch_expensive_waiter(void) {  // start a detached thread running do_expensive_wait
+    toku_pthread_t tid;
+    int r = toku_pthread_create(
+        toku_uninstrumented, &tid, nullptr, do_expensive_wait, nullptr);
+    assert_zero(r);
+    toku_pthread_detach(tid);  // never joined; the thread cleans up after itself
+    sleep(1);  // give the new thread time to reach the lock before the caller continues
+}
+
+static void launch_reader(void) {  // start a detached thread running do_read_wait
+    toku_pthread_t tid;
+    int r = toku_pthread_create(
+        toku_uninstrumented, &tid, nullptr, do_read_wait, nullptr);
+    assert_zero(r);
+    toku_pthread_detach(tid);  // never joined; the thread cleans up after itself
+    sleep(1);  // give the new thread time to reach the lock before the caller continues
+}
+
+static bool locks_are_expensive(void) {  // return w.write_lock_is_expensive(), asserting the read-side answer is the same
+    toku_mutex_lock(&mutex);
+    assert(w.write_lock_is_expensive() == w.read_lock_is_expensive());
+    bool is_expensive = w.write_lock_is_expensive();
+    toku_mutex_unlock(&mutex);
+    return is_expensive;
+}
+
+static void test_write_cheapness(void) {  // walk through holder/waiter combinations and assert what the lock reports as expensive
+    toku_mutex_init(toku_uninstrumented, &mutex, nullptr);
+    w.init(&mutex);
+
+    // single expensive write lock
+    grab_write_lock(true);
+    assert(locks_are_expensive());
+    release_write_lock();
+    assert(!locks_are_expensive());
+
+    // single cheap write lock
+    grab_write_lock(false);
+    assert(!locks_are_expensive());
+    release_write_lock();
+    assert(!locks_are_expensive());
+
+    // multiple read locks
+    grab_read_lock();
+    assert(!locks_are_expensive());
+    grab_read_lock();
+    grab_read_lock();
+    assert(!locks_are_expensive());
+    release_read_lock();
+    release_read_lock();
+    release_read_lock();
+    assert(!locks_are_expensive());
+
+    // expensive write lock and cheap writers waiting
+    grab_write_lock(true);
+    launch_cheap_waiter();
+    assert(locks_are_expensive());
+    launch_cheap_waiter();
+    launch_cheap_waiter();
+    assert(locks_are_expensive());
+    release_write_lock();
+    sleep(1);  // give the detached waiters time to take and release the lock
+    assert(!locks_are_expensive());
+
+    // cheap write lock and expensive writer waiter
+    grab_write_lock(false);
+    launch_expensive_waiter();
+    assert(locks_are_expensive());
+    release_write_lock();
+    sleep(1);  // give the detached waiter time to finish before the next scenario
+
+    // expensive write lock and expensive waiter
+    grab_write_lock(true);
+    launch_expensive_waiter();
+    assert(locks_are_expensive());
+    release_write_lock();
+    sleep(1);
+
+    // cheap write lock and cheap waiter
+    grab_write_lock(false);
+    launch_cheap_waiter();
+    assert(!locks_are_expensive());
+    release_write_lock();
+    sleep(1);
+
+    // read lock held and cheap waiter
+    grab_read_lock();
+    launch_cheap_waiter();
+    assert(!locks_are_expensive());
+    // add expensive waiter
+    launch_expensive_waiter();
+    assert(locks_are_expensive());
+    release_read_lock();
+    sleep(1);
+
+    // read lock held and expensive waiter
+    grab_read_lock();
+    launch_expensive_waiter();
+    assert(locks_are_expensive());
+    // add cheap waiter
+    launch_cheap_waiter();
+    assert(locks_are_expensive());
+    release_read_lock();
+    sleep(1);
+
+    // cheap write lock held and waiting read
+    grab_write_lock(false);
+    launch_reader();
+    assert(!locks_are_expensive());
+    launch_expensive_waiter();
+    toku_mutex_lock(&mutex);  // query both predicates directly: here they are expected to differ
+    assert(w.write_lock_is_expensive());
+    // tricky case here, because we have a launched reader
+    // that should be in the queue, a new read lock
+    // should piggy back off that
+    assert(!w.read_lock_is_expensive());
+    toku_mutex_unlock(&mutex);
+    release_write_lock();
+    sleep(1);
+
+    // expensive write lock held and waiting read
+    grab_write_lock(true);
+    launch_reader();
+    assert(locks_are_expensive());
+    launch_cheap_waiter();
+    assert(locks_are_expensive());
+    release_write_lock();
+    sleep(1);
+
+    w.deinit();
+    toku_mutex_destroy(&mutex);
+}
+
+int main (int UU(argc), const char* UU(argv[])) {  // entry point: initialize the two subsystems below, run the test, tear them down
+    // Ultra ugly. We manually init/destroy partitioned counters
+    // and context because normally toku_ft_layer_init() does that
+    // for us, but we don't want to initialize everything.
+    partitioned_counters_init();
+    toku_context_status_init();
+    test_write_cheapness();
+    toku_context_status_destroy();  // torn down in reverse order of initialization
+    partitioned_counters_destroy();
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/test-rwlock-unfair-writers.cc b/storage/tokudb/PerconaFT/util/tests/test-rwlock-unfair-writers.cc
new file mode 100644
index 00000000..0d1fc855
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test-rwlock-unfair-writers.cc
@@ -0,0 +1,98 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+// check if write locks are fair
+
+#include <stdio.h>
+#include <assert.h>
+#include <unistd.h>
+#include <pthread.h>
+
+pthread_rwlock_t rwlock;  // the lock under test; initialized in main
+volatile int killed = 0;  // set to 1 by main to make the writer threads stop looping
+
+static void *t1_func(void *arg) {  // writer thread: repeatedly take the write lock, hold it for 10 ms, release it
+    int i;  // iterations completed; printed when the thread stops
+    for (i = 0; !killed; i++) {
+        int r;
+        r = pthread_rwlock_wrlock(&rwlock); 
+        assert(r == 0);
+        usleep(10000);  // keep the write lock for 10 ms
+        r = pthread_rwlock_unlock(&rwlock);
+        assert(r == 0);
+    }
+    printf("%lu %d\n", (unsigned long) pthread_self(), i);  // thread id and iteration count
+    return arg;
+}
+
+// Start two writer threads that contend on a default-initialized
+// pthread_rwlock_t for 10 seconds, then stop them.  Each thread prints its
+// own iteration count; nothing about fairness is asserted, the counts are
+// only reported for inspection.
+//
+// The lock uses the default attributes (attr == NULL).  To experiment with a
+// different policy, initialize it from a pthread_rwlockattr_t instead, e.g.
+// after pthread_rwlockattr_setkind_np(&attr,
+// PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP) on glibc.
+int main(void) {
+    int r;
+
+    r = pthread_rwlock_init(&rwlock, NULL);
+    assert(r == 0);
+
+    const int nthreads = 2;
+    pthread_t tids[nthreads];
+    for (int i = 0; i < nthreads; i++) {
+        r = pthread_create(&tids[i], NULL, t1_func, NULL);
+        assert(r == 0);
+    }
+    sleep(10);   // let the writers compete
+    killed = 1;  // ask the writers to stop after their current iteration
+    for (int i = 0; i < nthreads; i++) {
+        void *ret;  // receives t1_func's return value; not inspected
+        r = pthread_join(tids[i], &ret);
+        assert(r == 0);
+    }
+
+    // All writers have been joined, so nothing can still hold the lock;
+    // destroy it to pair with the pthread_rwlock_init call above.
+    r = pthread_rwlock_destroy(&rwlock);
+    assert(r == 0);
+
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/test-rwlock.cc b/storage/tokudb/PerconaFT/util/tests/test-rwlock.cc
new file mode 100644
index 00000000..56dd3f6b
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test-rwlock.cc
@@ -0,0 +1,403 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+// Here are some timing numbers:
+// (Note: The not-quite-working version with cas can be found in r22519 of https://svn.tokutek.com/tokudb/toku/tokudb.2825/)  It's about as fast as "Best cas".)
+//
+// On ramie (2.53GHz E5540)
+//  Best nop           time=  1.074300ns
+//  Best cas           time=  8.595600ns
+//  Best mutex         time= 19.340201ns
+//  Best rwlock        time= 34.024799ns
+//  Best util rwlock time= 38.680500ns
+//  Best prelocked     time=  2.148700ns
+//  Best fair rwlock   time= 45.127600ns
+// On laptop
+//  Best nop           time=  2.876000ns
+//  Best cas           time= 15.362500ns
+//  Best mutex         time= 51.951498ns
+//  Best rwlock        time= 97.721201ns
+//  Best util rwlock time=110.456800ns
+//  Best prelocked     time=  4.240100ns
+//  Best fair rwlock   time=113.119102ns
+//
+// Analysis:  If the mutex can be prelocked (as cachetable does, it uses the same mutex for the cachetable and for the condition variable protecting the cache table)
+//  then you can save quite a bit.  What does the cachetable do?
+//  During pin:   (In the common case:) It grabs the mutex, grabs a read lock,  and releases the mutex.
+//  During unpin: It grabs the mutex, unlocks the rwlock lock in the pair, and releases the mutex. 
+//  Both actions must acquire a cachetable lock during that time, so definitely saves time to do it that way.
+
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include <toku_portability.h>
+#include <toku_assert.h>
+#include <portability/toku_atomic.h>
+#include <portability/toku_pthread.h>
+#include <portability/toku_time.h>
+#include <util/frwlock.h>
+#include <util/rwlock.h>
+#include "rwlock_condvar.h"
+
+static int verbose=1;
+static int timing_only=0;
+
+// Process the command line.  "-v" raises and "-q" lowers the global `verbose`
+// level; "--timing-only" sets the global `timing_only` flag.  Any other
+// argument prints a usage line on stderr and exits with status 1.
+static void parse_args (int argc, const char *argv[]) {
+    const char *const program = argv[0];
+    for (int k = 1; k < argc; k++) {
+        const char *const opt = argv[k];
+        if (strcmp(opt, "-v")==0) {
+            verbose++;
+            continue;
+        }
+        if (strcmp(opt, "-q")==0) {
+            verbose--;
+            continue;
+        }
+        if (strcmp(opt, "--timing-only")==0) {
+            timing_only=1;
+            continue;
+        }
+        // Unrecognized option.
+        fprintf(stderr, "Usage: %s {-q}* {-v}* {--timing-only}\n", program);
+        exit(1);
+    }
+}
+
+static const int T=6;
+static const int N=10000000;
+
+static double best_nop_time=1e12;
+static double best_fcall_time=1e12;
+static double best_cas_time=1e12;
+static double best_mutex_time=1e12;
+static double best_rwlock_time=1e12;
+static double best_util_time=1e12;
+static double best_prelocked_time=1e12;
+static double best_frwlock_time=1e12;
+static double best_frwlock_prelocked_time=1e12;
+// Minimum of two doubles: yields a when a<b holds, otherwise b.
+static double mind(double a, double b) { return (a<b) ? a : b; }
+
+#if 0
+// gcc 4.4.4 (fedora 12) doesn't introduce memory barriers on these writes, so I think that volatile is not enough for sequential consistency.
+// Intel guarantees that writes are seen in the same order as they were performed on one processor.  But if there were two processors, funny things could happen.
+volatile int sc_a, sc_b;
+void sequential_consistency (void) {
+    sc_a = 1;
+    sc_b = 0;
+}
+#endif
+    
+// Declaring val to be volatile produces essentially identical code as putting the asm volatile memory statements in.
+// gcc is not introducing memory barriers to force sequential consistency on volatile memory writes.
+// That's probably good enough for us, since we'll have a barrier instruction anywhere it matters.
+volatile int val = 0;
+
+// Baseline measurement: the cost of the benchmark loop itself, with no lock.
+// Runs T timed rounds of N iterations; each iteration checks and toggles the
+// volatile global `val`.  The smallest per-iteration time seen is kept in
+// best_nop_time (labelled ns, which presumes toku_tdiff() yields seconds).
+static void time_nop (void) __attribute((__noinline__)); // don't want it inline, because it messes up timing.
+static void time_nop (void) {
+    struct timeval start,end;
+    for (int t=0; t<T; t++) {
+	gettimeofday(&start, NULL);
+	for (int i=0; i<N; i++) {
+	    // val is volatile, so each of these accesses is really performed.
+	    if (val!=0) abort();
+	    val=1;
+	    //__asm__ volatile ("" : : : "memory");
+	    val=0;
+	    //__asm__ volatile ("" : : : "memory");
+	}
+	gettimeofday(&end,   NULL);
+	// Average cost of one iteration in this round.
+	double diff = 1e9*toku_tdiff(&end, &start)/N;
+	if (verbose>1)
+	    fprintf(stderr, "nop               = %.6fns/(lock+unlock)\n", diff);
+	best_nop_time=mind(best_nop_time,diff);
+    }
+}
+
+// This function is defined so we can measure the cost of a function call.
+// Identity function that is never inlined, so that calling it costs exactly
+// one real function call.  Returns its argument unchanged.
+int fcall_nop (int i) __attribute__((__noinline__));
+int fcall_nop (int i) {
+    return i;
+}
+
+// Measure the cost of a non-inlined function call: T timed rounds of N calls
+// to fcall_nop().  The smallest per-call time seen is kept in best_fcall_time.
+void time_fcall (void) __attribute((__noinline__));
+void time_fcall (void) {
+    struct timeval start,end;
+    for (int t=0; t<T; t++) {
+	gettimeofday(&start, NULL);
+	for (int i=0; i<N; i++) {
+	    fcall_nop(i);
+	}
+	gettimeofday(&end,   NULL);
+	// Average cost of one call in this round.
+	double diff = 1e9*toku_tdiff(&end, &start)/N;
+	if (verbose>1)
+	    fprintf(stderr, "fcall             = %.6fns/(lock+unlock)\n", diff);
+	best_fcall_time=mind(best_fcall_time,diff);
+    }
+}
+
+// Measure a pair of compare-and-swap operations on a local variable, standing
+// in for a lock (0 -> 1) followed by an unlock (1 -> 0).  Runs T timed rounds
+// of N pairs; the smallest per-pair time seen is kept in best_cas_time.
+void time_cas (void) __attribute__((__noinline__));
+void time_cas (void) {
+    volatile int64_t tval = 0;
+    struct timeval start,end;
+    for (int t=0; t<T; t++) {
+	gettimeofday(&start, NULL);
+	for (int i=0; i<N; i++) {
+	    // Each swap must observe the value left by the previous one.
+	    { int r = toku_sync_val_compare_and_swap(&tval, 0, 1);  assert(r==0); }
+	    { int r = toku_sync_val_compare_and_swap(&tval, 1, 0);  assert(r==1); }
+	}
+	gettimeofday(&end,   NULL);
+	double diff = 1e9*toku_tdiff(&end, &start)/N;
+	if (verbose>1)
+	    fprintf(stderr, "cas           = %.6fns/(lock+unlock)\n", diff);
+	best_cas_time=mind(best_cas_time,diff);
+    }
+}
+
+
+// Measure an uncontended pthread mutex lock+unlock pair.  Runs T timed rounds
+// of N pairs on a local mutex; the smallest per-pair time seen is kept in
+// best_mutex_time.
+void time_pthread_mutex (void) __attribute__((__noinline__));
+void time_pthread_mutex (void) {
+    pthread_mutex_t mutex;
+    { int r = pthread_mutex_init(&mutex, NULL); assert(r==0); }
+    struct timeval start,end;
+    // One untimed lock/unlock before the measured rounds.
+    pthread_mutex_lock(&mutex);
+    pthread_mutex_unlock(&mutex);
+    for (int t=0; t<T; t++) {
+	gettimeofday(&start, NULL);
+	for (int i=0; i<N; i++) {
+	    pthread_mutex_lock(&mutex);
+	    pthread_mutex_unlock(&mutex);
+	}
+	gettimeofday(&end,   NULL);
+	double diff = 1e9*toku_tdiff(&end, &start)/N;
+	if (verbose>1)
+	    fprintf(stderr, "pthread_mutex     = %.6fns/(lock+unlock)\n", diff);
+	best_mutex_time=mind(best_mutex_time,diff);
+    }
+    { int r = pthread_mutex_destroy(&mutex);    assert(r==0); }
+}
+
+// Measure an uncontended pthread rwlock read-lock+unlock pair.  Runs T timed
+// rounds of N pairs on a local rwlock; the smallest per-pair time seen is kept
+// in best_rwlock_time.
+void time_pthread_rwlock (void) __attribute__((__noinline__));
+void time_pthread_rwlock (void) {
+    pthread_rwlock_t mutex;
+    { int r = pthread_rwlock_init(&mutex, NULL); assert(r==0); }
+    struct timeval start,end;
+    // One untimed lock/unlock before the measured rounds.
+    pthread_rwlock_rdlock(&mutex);
+    pthread_rwlock_unlock(&mutex);
+    for (int t=0; t<T; t++) {
+	gettimeofday(&start, NULL);
+	for (int i=0; i<N; i++) {
+	    pthread_rwlock_rdlock(&mutex);
+	    pthread_rwlock_unlock(&mutex);
+	}
+	gettimeofday(&end,   NULL);
+	double diff = 1e9*toku_tdiff(&end, &start)/N;
+	if (verbose>1)
+	    fprintf(stderr, "pthread_rwlock(r) = %.6fns/(lock+unlock)\n", diff);
+	best_rwlock_time=mind(best_rwlock_time,diff);
+    }
+    { int r = pthread_rwlock_destroy(&mutex);    assert(r==0); }
+}
+
+// Take a read lock on `rwlock`, acquiring and releasing the external `mutex`
+// around the rwlock_read_lock() call.
+static void util_rwlock_lock (RWLOCK rwlock, toku_mutex_t *mutex) {
+    toku_mutex_lock(mutex);
+    rwlock_read_lock(rwlock, mutex);
+    toku_mutex_unlock(mutex);
+}
+
+// Release a read lock on `rwlock`, acquiring and releasing the external
+// `mutex` around the rwlock_read_unlock() call.
+static void util_rwlock_unlock (RWLOCK rwlock, toku_mutex_t *mutex) {
+    toku_mutex_lock(mutex);
+    rwlock_read_unlock(rwlock);
+    toku_mutex_unlock(mutex);
+}
+
+// Time the read lock that's in util/rwlock.h
+// Measure a read-lock+unlock pair on the util/rwlock.h lock when every
+// operation also has to take and release the external mutex (see
+// util_rwlock_lock / util_rwlock_unlock).  Runs T timed rounds of N pairs; the
+// smallest per-pair time seen is kept in best_util_time.
+void time_util_rwlock(void) __attribute((__noinline__));
+void time_util_rwlock(void) {
+    struct st_rwlock rwlock;
+    toku_mutex_t external_mutex;
+    toku_mutex_init(toku_uninstrumented, &external_mutex, nullptr);
+    rwlock_init(toku_uninstrumented, &rwlock);
+    struct timeval start, end;
+
+    // One untimed lock/unlock before the measured rounds.
+    util_rwlock_lock(&rwlock, &external_mutex);
+    util_rwlock_unlock(&rwlock, &external_mutex);
+    for (int t=0; t<T; t++) {
+	gettimeofday(&start, NULL);
+	for (int i=0; i<N; i++) {
+	    util_rwlock_lock(&rwlock, &external_mutex);
+	    util_rwlock_unlock(&rwlock, &external_mutex);
+	}
+	gettimeofday(&end,   NULL);
+	double diff = 1e9*toku_tdiff(&end, &start)/N;
+	if (verbose>1)
+	    fprintf(stderr, "util_rwlock(r) = %.6fns/(lock+unlock)\n", diff);
+	best_util_time=mind(best_util_time,diff);
+    }
+    rwlock_destroy(&rwlock);
+    toku_mutex_destroy(&external_mutex);
+}
+
+// Time the read lock that's in util/rwlock.h, assuming the mutex is already
+// held.
+// Measure a read-lock+unlock pair on the util/rwlock.h lock while the external
+// mutex stays held for the whole measurement, so no mutex traffic is timed.
+// Runs T timed rounds of N pairs; the smallest per-pair time seen is kept in
+// best_prelocked_time.
+void time_util_prelocked_rwlock(void) __attribute__((__noinline__));
+void time_util_prelocked_rwlock(void) {
+    struct st_rwlock rwlock;
+    toku_mutex_t external_mutex;
+    toku_mutex_init(toku_uninstrumented, &external_mutex, nullptr);
+    // Held from here until just before the mutex is destroyed.
+    toku_mutex_lock(&external_mutex);
+    rwlock_init(toku_uninstrumented, &rwlock);
+    struct timeval start, end;
+
+    // One untimed lock/unlock before the measured rounds.
+    rwlock_read_lock(&rwlock, &external_mutex);
+    rwlock_read_unlock(&rwlock);
+    for (int t=0; t<T; t++) {
+	gettimeofday(&start, NULL);
+	for (int i=0; i<N; i++) {
+	    rwlock_read_lock(&rwlock, &external_mutex);
+	    rwlock_read_unlock(&rwlock);
+	}
+	gettimeofday(&end,   NULL);
+	double diff = 1e9*toku_tdiff(&end, &start)/N;
+	if (verbose>1)
+	    fprintf(stderr, "pre_util_rwlock(r) = %.6fns/(lock+unlock)\n", diff);
+	best_prelocked_time=mind(best_prelocked_time,diff);
+    }
+    rwlock_destroy(&rwlock);
+    toku_mutex_unlock(&external_mutex);
+    toku_mutex_destroy(&external_mutex);
+}
+
+// Measure a read_lock+read_unlock pair on toku::frwlock while the external
+// mutex stays held for the whole measurement.  Before timing, each of
+// read_lock, try_read_lock, write_lock and try_write_lock is exercised once,
+// and the try variants are required to succeed.  Runs T timed rounds of N
+// pairs; the smallest per-pair time seen is kept in
+// best_frwlock_prelocked_time.
+void time_frwlock_prelocked(void) __attribute__((__noinline__));
+void time_frwlock_prelocked(void) {
+    toku_mutex_t external_mutex;
+    toku_mutex_init(toku_uninstrumented, &external_mutex, nullptr);
+    struct timeval start, end;
+    toku::frwlock x;
+    x.init(&external_mutex);
+    // Held from here until just before the mutex is destroyed.
+    toku_mutex_lock(&external_mutex);
+    bool got_lock;
+    x.read_lock();
+    x.read_unlock();
+
+    // The lock is free at each of these points, so the try variants must win.
+    got_lock = x.try_read_lock();
+    invariant(got_lock);
+    x.read_unlock();
+    x.write_lock(true);
+    x.write_unlock();
+    got_lock = x.try_write_lock(true);
+    invariant(got_lock);
+    x.write_unlock();
+    for (int t=0; t<T; t++) {
+	gettimeofday(&start, NULL);
+	for (int i=0; i<N; i++) {
+	    x.read_lock();
+	    x.read_unlock();
+	}
+	gettimeofday(&end,   NULL);
+	double diff = 1e9*toku_tdiff(&end, &start)/N;
+	if (verbose>1)
+	    fprintf(stderr, "frwlock_prelocked = %.6fns/(lock+unlock)\n", diff);
+        best_frwlock_prelocked_time=mind(best_frwlock_prelocked_time,diff);
+    }
+    x.deinit();
+    toku_mutex_unlock(&external_mutex);
+    toku_mutex_destroy(&external_mutex);
+}
+
+// Measure a read_lock+read_unlock pair on toku::frwlock when each of the two
+// operations separately takes and releases the external mutex.  Runs T timed
+// rounds of N pairs; the smallest per-pair time seen is kept in
+// best_frwlock_time.
+void time_frwlock(void) __attribute__((__noinline__));
+void time_frwlock(void) {
+    toku_mutex_t external_mutex;
+    toku_mutex_init(toku_uninstrumented, &external_mutex, nullptr);
+    struct timeval start, end;
+    toku::frwlock x;
+    x.init(&external_mutex);
+    // One untimed lock/unlock before the measured rounds.
+    toku_mutex_lock(&external_mutex);
+    x.read_lock();
+    x.read_unlock();
+    toku_mutex_unlock(&external_mutex);
+    for (int t=0; t<T; t++) {
+	gettimeofday(&start, NULL);
+        for (int i=0; i<N; i++) {
+            // Lock and unlock are each wrapped in their own mutex section.
+            toku_mutex_lock(&external_mutex);
+            x.read_lock();
+            toku_mutex_unlock(&external_mutex);
+
+            toku_mutex_lock(&external_mutex);
+            x.read_unlock();
+            toku_mutex_unlock(&external_mutex);
+        }
+	gettimeofday(&end,   NULL);
+	double diff = 1e9*toku_tdiff(&end, &start)/N;
+	if (verbose>1)
+	    fprintf(stderr, "frwlock           = %.6fns/(lock+unlock)\n", diff);
+        best_frwlock_time=mind(best_frwlock_time,diff);
+    }
+    x.deinit();
+    toku_mutex_destroy(&external_mutex);
+}
+
+// Entry point.  Nothing is run unless --timing-only was given; in that case
+// every benchmark is executed and, when verbose is positive, the best time
+// recorded for each one is printed.  Always returns 0.
+int main (int argc, const char *argv[]) {
+    parse_args(argc, argv);
+    if (!timing_only) {
+        return 0;
+    }
+
+    // Flip to false to make it easy to only time the templated frwlock.
+    const bool time_baselines = true;
+
+    if (time_baselines) {
+        time_nop();
+        time_fcall();
+        time_cas();
+        time_pthread_mutex();
+        time_pthread_rwlock();
+        time_util_rwlock();
+        time_util_prelocked_rwlock();
+    }
+    time_frwlock();
+    time_frwlock_prelocked();
+
+    if (verbose<=0) {
+        return 0;
+    }
+    if (time_baselines) {
+        printf("//  Best nop              time=%10.6fns\n", best_nop_time);
+        printf("//  Best fcall            time=%10.6fns\n", best_fcall_time);
+        printf("//  Best cas              time=%10.6fns\n", best_cas_time);
+        printf("//  Best mutex            time=%10.6fns\n", best_mutex_time);
+        printf("//  Best rwlock           time=%10.6fns\n", best_rwlock_time);
+        printf("//  Best util rwlock      time=%10.6fns\n", best_util_time);
+        printf("//  Best prelocked        time=%10.6fns\n", best_prelocked_time);
+    }
+    printf("//  Best frwlock         time=%10.6fns\n", best_frwlock_time);
+    printf("//  Best frwlock_pre     time=%10.6fns\n", best_frwlock_prelocked_time);
+    return 0;
+}
+
diff --git a/storage/tokudb/PerconaFT/util/tests/test.h b/storage/tokudb/PerconaFT/util/tests/test.h
new file mode 100644
index 00000000..fdd2d3f6
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test.h
@@ -0,0 +1,84 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <stdlib.h>
+#include <toku_portability.h>
+#include <toku_assert.h>
+#include <util/partitioned_counter.h>
+#include <string.h>
+
+#define CKERR(r) ({ int __r = r; if (__r!=0) fprintf(stderr, "%s:%d error %d %s\n", __FILE__, __LINE__, __r, strerror(r)); assert(__r==0); })
+// CKERR2(r,r2): assert that r equals the single expected value r2, reporting
+// the mismatch on stderr first.  The arguments are expanded textually several
+// times, so pass plain variables or constants rather than calls with side
+// effects.
+#define CKERR2(r,r2) do { if (r!=r2) fprintf(stderr, "%s:%d error %d %s, expected %d\n", __FILE__, __LINE__, r, strerror(r), r2); assert(r==r2); } while (0)
+// CKERR2s(r,r2,r3): like CKERR2, but r may equal either r2 or r3.  The same
+// caveat about repeated argument expansion applies.
+#define CKERR2s(r,r2,r3) do { if (r!=r2 && r!=r3) fprintf(stderr, "%s:%d error %d %s, expected %d or %d\n", __FILE__, __LINE__, r, strerror(r), r2,r3); assert(r==r2||r==r3); } while (0)
+
+// DEBUG_LINE: write the current function, file and line to stderr and flush,
+// as a quick trace point.
+#define DEBUG_LINE do { \
+    fprintf(stderr, "%s() %s:%d\n", __FUNCTION__, __FILE__, __LINE__); \
+    fflush(stderr); \
+} while (0)
+
+static int verbose;
+
+// Standard option handling for the util tests.  Every "-v" increments the
+// file-level `verbose`, "-q" resets it to 0, and any other argument prints a
+// usage message on stderr and exits with status 1.
+static inline void
+default_parse_args (int argc, const char *argv[]) {
+    const char *const program = argv[0];
+    for (int k = 1; k < argc; k++) {
+        const char *const opt = argv[k];
+        const bool is_verbose = (strcmp(opt,"-v")==0);
+        const bool is_quiet = !is_verbose && (strcmp(opt,"-q")==0);
+        if (is_verbose) {
+            ++verbose;
+        } else if (is_quiet) {
+            verbose=0;
+        } else {
+            fprintf(stderr, "Usage:\n %s [-v] [-q]\n", program);
+            exit(1);
+        }
+    }
+}
+
+int test_main(int argc, const char *argv[]);
+
+// Shared entry point for tests that include this header: bring up the
+// portability layer and the partitioned counters, run the test's test_main(),
+// tear both down again in reverse order, and use test_main()'s result as the
+// process exit status.
+int
+main(int argc, const char *argv[]) {
+    const int init_status = toku_portability_init();
+    assert(init_status==0);
+    partitioned_counters_init();
+
+    const int test_status = test_main(argc, argv);
+
+    partitioned_counters_destroy();
+    toku_portability_destroy();
+    return test_status;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/test_doubly_linked_list.cc b/storage/tokudb/PerconaFT/util/tests/test_doubly_linked_list.cc
new file mode 100644
index 00000000..cac17f1a
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test_doubly_linked_list.cc
@@ -0,0 +1,184 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "test.h"
+#include <stdlib.h>
+#include <util/doubly_linked_list.h>
+
+using namespace toku;
+
+// Assert that the list holds no elements, i.e. that pop() reports there was
+// nothing to remove.
+static void check_is_empty (DoublyLinkedList<int> *l) {
+    LinkedListElement<int> *popped;
+    const bool got_one = l->pop(&popped);
+    assert(!got_one);
+}
+
+// Smoke test on a local list.  First a single insert/pop round trip, then two
+// inserts followed by two pops that must hand back each value exactly once
+// (in either order).  The list must be empty after each phase.
+static void test_doubly_linked_list (void) {
+    DoublyLinkedList<int> list;
+    list.init();
+    LinkedListElement<int> first_elt, second_elt;
+    LinkedListElement<int> *popped;
+    bool ok;
+
+    // Phase 1: one element in, the very same element out.
+    list.insert(&first_elt, 3);
+    ok = list.pop(&popped);
+    assert(ok);
+    assert(popped==&first_elt);
+    assert(popped->get_container()==3);
+    check_is_empty(&list);
+
+    // Phase 2: two elements in; each value must come out once and only once.
+    list.insert(&first_elt, 0);
+    list.insert(&second_elt, 1);
+    bool outstanding[2] = {true, true};
+    for (int remaining = 2; remaining > 0; remaining--) {
+        ok = list.pop(&popped);
+        assert(ok);
+        const int value = popped->get_container();
+        assert(value==0 || value==1);
+        assert(outstanding[value]);
+        outstanding[value] = false;
+    }
+    check_is_empty(&list);
+}
+
+const int N=100;
+bool in[N];
+DoublyLinkedList<int> l;
+LinkedListElement<int> elts[N];
+
+// Pick a random value in [0,N) and, unless it is already recorded in `in`,
+// insert its element into the global list and mark it present.
+static void maybe_insert_random(void) {
+    const int candidate = random()%N;
+    if (in[candidate]) {
+        return;  // already in the list; nothing to do
+    }
+    if (verbose) printf("I%d ", candidate);
+    l.insert(&elts[candidate], candidate);
+    in[candidate] = true;
+}
+
+static bool checked[N];
+static int  check_count;
+// Iteration callback used by check_equal(): verifies that value v is in range,
+// is expected to be in the list, and has not been visited before; then marks
+// it visited and counts it.  `deadbeef` is the extra argument handed to
+// iterate() and must arrive intact as 0xdeadbeef.  Returns 0 so that the
+// iteration continues.
+static int check_is_in(int v, int deadbeef) {
+    // Compare (not assign): confirms the extra argument was passed through.
+    // The cast mirrors the unsigned-to-int conversion done at the call site.
+    assert(deadbeef==(int)0xdeadbeef);
+    assert(0<=v && v<N);
+    assert(!checked[v]);
+    assert(in[v]);
+    checked[v]=true;
+    check_count++;
+    return 0;
+}
+static int quit_count=0;
+// Iteration callback used by check_equal() to test early termination: counts
+// invocations in quit_count and, on the check_count-th one, returns
+// check_count (nonzero) to stop the iteration; otherwise returns 0.
+// `beefbeef` is the extra argument handed to iterate(); check_equal() passes
+// 0xbeefbeef for this callback.
+static int quit_early(int v __attribute__((__unused__)), int beefbeef) {
+    // Compare (not assign) against the value the caller actually passes.
+    // The cast mirrors the unsigned-to-int conversion done at the call site.
+    assert(beefbeef==(int)0xbeefbeef);
+    quit_count++;
+    if (quit_count==check_count) return check_count;
+    else return 0;
+}
+
+// Verify that the global list and the `in` array agree, then verify that an
+// iteration can be cut short.
+//  Pass 1 walks the whole list with check_is_in and requires that exactly the
+//  values marked in `in` were visited.
+//  Pass 2 (only when the list is non-empty) picks a random stopping point and
+//  requires that iterate() returns the callback's nonzero result.
+static void check_equal(void) {
+    check_count = 0;
+    for (int k = 0; k < N; k++) {
+        checked[k] = false;
+    }
+    const int full_walk = l.iterate<int>(check_is_in, 0xdeadbeef);
+    assert(full_walk==0);
+    for (int k = 0; k < N; k++) {
+        assert(checked[k]==in[k]);
+    }
+
+    if (check_count <= 0) {
+        return;
+    }
+    check_count = 1+random()%check_count; // quit after 1 or more iterations
+    quit_count = 0;
+    const int partial_walk = l.iterate<int>(quit_early, 0xbeefbeef);
+    assert(partial_walk==check_count);
+}
+
+// Randomized test of the global list `l` against the reference array `in`.
+// Seeds the list with up to N/2 random inserts, then performs N*N random
+// steps.  Each step picks a value x: if x is present it is either removed
+// directly or some element is popped (50/50); if x is absent it is inserted.
+// After every step check_equal() confirms list and array agree.  Finally the
+// list is drained with pop() and `in` must end up all false.
+static void test_doubly_linked_list_randomly(void) {
+    l.init();
+    for (int i=0; i<N; i++) in[i]=false;
+
+    for (int i=0; i<N/2; i++) maybe_insert_random();
+    if (verbose) printf("\n");
+
+    for (int i=0; i<N*N; i++) {
+	int x = random()%N;
+	if (in[x]) {
+	    if (random()%2==0) {
+		// Remove the specific element x.
+		if (verbose) printf("%dR%d ", i, x);
+		l.remove(&elts[x]);
+		in[x]=false;
+	    } else {
+		// Pop whichever element the list yields; it need not be x.
+		LinkedListElement<int> *re;
+		bool r = l.pop(&re);
+		assert(r);
+		int v = re->get_container();
+		assert(in[v]);
+		in[v]=false;
+		if (verbose) printf("%dP%d ", i, v);
+	    }
+	} else {
+	    l.insert(&elts[x], x);
+	    in[x]=true;
+	    if (verbose) printf("%dI%d ", i, x);
+	}
+
+	check_equal();
+    }
+    if (verbose) printf("\n");
+
+    // Drain whatever is left; every popped value must have been marked present.
+    LinkedListElement<int> *re;
+    while (l.pop(&re)) {
+	int v = re->get_container();
+	assert(in[v]);
+	in[v]=false;
+	if (verbose) printf("P%d ", v);
+    }
+    for (int i=0; i<N; i++) assert(!in[i]);
+    if (verbose) printf("\n");
+}
+
+// Test driver: parse -v/-q, run the fixed smoke test once, then run the
+// randomized test four times.  Returns 0.
+int test_main (int argc, const char *argv[]) {
+    default_parse_args(argc, argv);
+    test_doubly_linked_list();
+    int rounds_left = 4;
+    while (rounds_left > 0) {
+        test_doubly_linked_list_randomly();
+        rounds_left--;
+    }
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/test_partitioned_counter.cc b/storage/tokudb/PerconaFT/util/tests/test_partitioned_counter.cc
new file mode 100644
index 00000000..a4e6f842
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test_partitioned_counter.cc
@@ -0,0 +1,416 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+/* This code can either test the PARTITIONED_COUNTER abstraction or it can time various implementations. */
+
+/* Try to make counter that requires no cache misses to increment, and to get the value can be slow.
+ * I don't care much about races between the readers and writers on the counter.
+ *
+ * The problem: We observed that incrementing a counter with multiple threads is quite expensive.
+ * Here are some performance numbers:
+ * Machines:  mork or mindy (Intel Xeon L5520 2.27GHz)
+ *            bradley's 4-core laptop (Intel Core i7-2640M 2.80GHz) sandy bridge
+ *            alf       16-core server (xeon E5-2665 2.4GHz) sandybridge
+ *
+ *      mork  mindy  bradley  alf
+ *     1.22ns  1.07ns  1.27ns  0.61ns   to do a ++, but it's got a race in it.
+ *    27.11ns 20.47ns 18.75ns 34.15ns   to do a sync_fetch_and_add().
+ *     0.26ns  0.29ns  0.71ns  0.19ns   to do with a single version of a counter
+ *     0.35ns  0.33ns  0.69ns  0.18ns   pure thread-local variable (no way to add things up)
+ *             0.76ns  1.50ns  0.35ns   partitioned_counter.c (using link-time optimization, otherwise the function call overwhelms everything)
+ *     2.21ns          3.32ns  0.70ns   partitioned_counter.c (using gcc, the C version at r46097, not C++)  This one is a little slower because it has an extra branch in it.
+ * 
+ * Surprisingly, compiling this code without -fPIC doesn't make it any faster (even the pure thread-local variable is the same).  -fPIC access to
+ * thread-local variables look slower since they have a function call, but they don't seem to be any slower in practice.  In fact, even the puretl-ptr test
+ * which simply increments a thread-local pointer is basically the same speed as accessing thread_local variable.
+ * 
+ * How it works.  Each thread has a thread-local counter structure with an integer in it.  To increment, we increment the thread-local structure.
+ *   The other operation is to query the counters to get the sum of all the thread-local variables.
+ *   The first time a pthread increments the variable we add the variable to a linked list.
+ *   When a pthread ends, we use the pthread_key destructor to remove the variable from the linked list.  We also have to remember the sum of everything
+ *    that has been removed from the list.
+ *   To get the sum we add the sum of the destructed items, plus everything in the list.
+ *
+ */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <toku_race_tools.h>
+#include <toku_assert.h>
+#include <portability/toku_atomic.h>
+#include <memory.h>
+#include <util/partitioned_counter.h>
+#include "test.h"
+
+// The test code includes the fastest version I could figure out to make, implemented below.
+
+// Per-thread node of the hand-rolled counter that this test times against the
+// real partitioned counter.  Field order matters: the thread-local instance
+// below is initialized positionally.
+struct counter_s {
+    bool inited;                    // set by increment() once the node is linked into the list
+    volatile int counter;           // this thread's private count
+    struct counter_s *prev, *next;  // links in the list of live nodes anchored at `head`
+    int myid;                       // sequence number taken from idcounter in increment()
+};
+static __thread struct counter_s counter = {false,0, NULL,NULL,0};
+
+static int finished_counter=0; // counter for all threads that are done.
+
+// We use a single mutex for anything complex.  We'd like to use a mutex per partitioned counter, but we must cope with the possibility of a race between
+// a terminating pthread (which calls destroy_counter()), and a call to the counter destructor.  So we use a global mutex.
+static pthread_mutex_t pc_mutex = PTHREAD_MUTEX_INITIALIZER;
+static struct counter_s *head=NULL;
+static pthread_key_t   counter_key;
+
+// Effect: Acquire the global pc mutex; asserts that the acquisition succeeded.
+static void pc_lock (void)
+{
+    const int status = pthread_mutex_lock(&pc_mutex);
+    assert(status==0);
+}
+
+// Effect: Release the global pc mutex; asserts that the release succeeded.
+static void pc_unlock (void)
+{
+    const int status = pthread_mutex_unlock(&pc_mutex);
+    assert(status==0);
+}
+
+// Effect: Destructor registered with pthread_key_create; runs when a thread
+//   that has incremented the counter terminates.  Under the pc mutex it
+//   unlinks the calling thread's node from the list of live nodes and folds
+//   that thread's count into finished_counter so the total is not lost.
+static void destroy_counter (void *counterp)
+{
+    assert((struct counter_s*)counterp==&counter);
+    pc_lock();
+    struct counter_s *const before = counter.prev;
+    struct counter_s *const after  = counter.next;
+    if (before!=NULL) {
+        before->next = after;
+    } else {
+        // No predecessor, so this node must be the list head.
+        assert(head==&counter);
+        head = after;
+    }
+    if (after!=NULL) {
+        after->prev = before;
+    }
+    finished_counter += counter.counter;
+    TOKU_VALGRIND_HG_ENABLE_CHECKING(&counter.counter, sizeof(counter.counter)); // stop ignoring races
+    //printf("finished counter now %d\n", finished_counter);
+    pc_unlock();
+}
+
+static int idcounter=0;
+
+// Add one to the calling thread's private counter.  On a thread's first call
+// the node is registered under the pc mutex: it is stored as the thread's
+// value for counter_key (so destroy_counter runs at thread exit), pushed onto
+// the front of the list at `head`, zeroed, given an id, and marked inited.
+// Later calls skip straight to the unlocked increment.
+static inline void increment (void) {
+    if (!counter.inited) {
+        pc_lock();
+        struct counter_s *cp = &counter;
+	{ int r = pthread_setspecific(counter_key, cp); assert(r==0); }
+	// Push this node onto the front of the list.
+	cp->prev = NULL;
+	cp->next = head;
+	if (head!=NULL) {
+	    head->prev = cp;
+	}
+        head = cp;
+	cp->counter = 0;
+	cp->inited = true;
+	cp->myid = idcounter++;
+	TOKU_VALGRIND_HG_DISABLE_CHECKING(&counter.counter, sizeof(counter.counter)); // the counter increment is kind of racy.
+        pc_unlock();
+    }
+    // Fast path: no lock is taken for the increment itself.
+    counter.counter++;
+}
+
+// Return the current total: the counts already folded in from finished
+// threads plus the private count of every node still on the list.  The walk
+// is done while holding the pc mutex.
+static int getvals (void) {
+    pc_lock();
+    int total = finished_counter;
+    struct counter_s *node = head;
+    while (node) {
+        total += node->counter;
+        node = node->next;
+    }
+    pc_unlock();
+    return total;
+}
+    
+/**********************************************************************************/
+/* And now for some actual test code.                                             */
+/**********************************************************************************/
+
+static const int N=10000000;
+static const int T=20;
+
+
+PARTITIONED_COUNTER pc;
+// Thread body: add 1 to the global partitioned counter `pc` N times.
+// Returns its argument unchanged.
+static void *pc_doit (void *v) {
+    for (int i=0; i<N; i++) {
+	increment_partitioned_counter(pc, 1);
+    }
+    //printf("val=%ld\n", read_partitioned_counter(pc));
+    return v;
+}
+
+// Thread body: call the hand-rolled thread-local increment() N times.
+// Returns its argument unchanged.  The trailing printf is compiled but
+// disabled by the constant-false condition.
+static void* new_doit (void* v) {
+    for (int i=0; i<N; i++) {
+	increment();
+	//if (i%0x2000 == 0) sched_yield();
+    }
+    if (0) printf("done id=%d, getvals=%d\n", counter.myid, getvals());
+    return v;
+}
+
+static int oldcounter=0;
+
+// Thread body: add 1 to the shared global `oldcounter` N times using
+// toku_sync_fetch_and_add.  Returns its argument unchanged.
+static void* old_doit (void* v) {
+    for (int i=0; i<N; i++) {
+	(void)toku_sync_fetch_and_add(&oldcounter, 1);
+	//if (i%0x1000 == 0) sched_yield();
+    }
+    return v;
+}
+
+static volatile int oldcounter_nonatomic=0;
+
+// Thread body: increment the shared volatile global `oldcounter_nonatomic` N
+// times with a plain ++ (no atomic operation, so concurrent updates can be
+// lost).  Returns its argument unchanged.
+static void* old_doit_nonatomic (void* v) {
+    for (int i=0; i<N; i++) {
+	oldcounter_nonatomic++;
+	//if (i%0x1000 == 0) sched_yield();
+    }
+    return v;
+}
+
+static __thread volatile int thread_local_counter=0;
+// Thread body: increment the __thread variable `thread_local_counter` N
+// times.  Returns its argument unchanged.
+static void* tl_doit (void *v) {
+    for (int i=0; i<N; i++) {
+	thread_local_counter++;
+    }
+    return v;
+}
+
+// Return the elapsed time from *start to *end, in seconds.
+// The result is kept in double precision: a float has only about 7
+// significant digits, which cannot represent multi-second intervals to the
+// microsecond resolution that the callers print.
+static double tdiff (struct timeval *start, struct timeval *end) {
+    return (end->tv_sec-start->tv_sec) +1e-6*(end->tv_usec - start->tv_usec);
+}
+
+// Start a thread running f(extra) with default attributes, storing its handle
+// in *thread; asserts that creation succeeded.
+static void pt_create (pthread_t *thread, void *(*f)(void*), void *extra) {
+    const int status = pthread_create(thread, NULL, f, extra);
+    assert(status==0);
+}
+
+// Wait for `thread` to finish; asserts that the join succeeded and that the
+// thread's return value is exactly expect_extra.
+static void pt_join (pthread_t thread, void *expect_extra) {
+    void *returned = NULL;
+    const int status = pthread_join(thread, &returned);
+    assert(status==0);
+    assert(returned==expect_extra);
+}
+
+// Time T concurrent runs of f(NULL): start the clock, launch T threads, join them
+// all, stop the clock, then print the wall time and the time per increment
+// (elapsed / T / N, in nanoseconds) next to the given description.
+static void timeit (const char *description, void* (*f)(void*)) {
+    struct timeval t_begin, t_end;
+    pthread_t workers[T];
+    gettimeofday(&t_begin, 0);
+    for (int w=0; w<T; w++) pt_create(&workers[w], f, NULL);
+    for (int w=0; w<T; w++) pt_join(workers[w], NULL);
+    gettimeofday(&t_end, 0);
+    const float elapsed = tdiff(&t_begin, &t_end);
+    printf("%-10s Time=%.6fs (%7.3fns per increment)\n", description, elapsed, (1e9*elapsed/T)/N);
+}
+
+// Thread body for the "puretl-ptr" run: v points at a uint64_t slot handed to this thread,
+// so each of the N increments is only a load and a store through that pointer.
+static void* tl_doit_ptr (void *v) {
+    volatile uint64_t *slot = (uint64_t *)v;
+    for (int left=N; left>0; left--)
+        (*slot)++;
+    return v;
+}
+
+
+// Same measurement as timeit(), except that every thread is handed a pointer to its own
+// 64-byte-aligned slot (zeroed just before the thread starts) and must return that pointer.
+static void timeit_with_thread_local_pointer (const char *description, void* (*f)(void*)) {
+    struct timeval t_begin, t_end;
+    pthread_t workers[T];
+    struct { uint64_t values[8] __attribute__((__aligned__(64))); } slots[T]; // pad to different cache lines.
+    gettimeofday(&t_begin, 0);
+    for (int w=0; w<T; w++) {
+        slots[w].values[0]=0;
+        pt_create(&workers[w], f, &slots[w].values[0]);
+    }
+    for (int w=0; w<T; w++) pt_join(workers[w], &slots[w].values[0]);
+    gettimeofday(&t_end, 0);
+    printf("%-10s Time=%.6fs (%7.3fns per increment)\n", description, tdiff(&t_begin, &t_end), (1e9*tdiff(&t_begin, &t_end)/T)/N);
+}
+
+static int verboseness_cmdarg=0; // incremented once for every -v on the command line
+static bool time_cmdarg=false; // set by --time; makes test_main run do_timeit() instead of the tests
+
+// Parse the command line: "-v" (repeatable) raises verboseness_cmdarg, "--time" sets
+// time_cmdarg; anything else prints the usage text and exits with status 1.
+static void parse_args (int argc, const char *argv[]) {
+    const char *progname = argv[0]; // program name (argv[1] would be the first option, not the name)
+    for (int i=1; i<argc; i++) {
+        if (strcmp(argv[i], "-v")==0) verboseness_cmdarg++;
+        else if (strcmp(argv[i], "--time")==0) time_cmdarg=true;
+        else {
+            printf("Usage: %s [-v] [--time]\n Default is to run tests.  --time produces timing output.\n", progname);
+            exit(1);
+        }
+    }
+}
+
+static void do_timeit (void) { // Timing mode: run each increment strategy under T threads and print one result line per strategy.
+    { int r = pthread_key_create(&counter_key, destroy_counter); assert(r==0); } // create counter_key with destroy_counter as its per-thread destructor
+    printf("%d threads\n%d increments per thread\n", T, N);
+    timeit("++",         old_doit_nonatomic); // plain ++ on a shared volatile int
+    timeit("atomic++",   old_doit); // atomic fetch-and-add on a shared int
+    timeit("fast",       new_doit); // the increment() helper defined earlier in this file
+    timeit("puretl",     tl_doit); // ++ on a __thread variable
+    timeit_with_thread_local_pointer("puretl-ptr", tl_doit_ptr); // ++ through a pointer to a per-thread slot
+    pc = create_partitioned_counter(); // the global pc exists only for the duration of the "pc" run
+    timeit("pc",       pc_doit); // increment_partitioned_counter() on the shared pc
+    destroy_partitioned_counter(pc);
+}
+
+struct test_arguments { // state shared by one reader thread and its group of writer threads
+    PARTITIONED_COUNTER pc; // the counter the writers increment and the reader polls
+    uint64_t            limit; // total number of increments performed by all writers of the group together
+    uint64_t            total_increment_per_writer; // loop bound for each writer (limit / number of writers)
+    volatile uint64_t   unfinished_count; // writers still running; each writer atomically subtracts 1 when done
+};
+
+static void *reader_test_fun (void *ta_v) { // Reader thread: poll the counter until every writer has finished, checking each value read.
+    struct test_arguments *ta = (struct test_arguments *)ta_v;
+    uint64_t lastval = 0;
+    while (ta->unfinished_count>0) { // keep polling while at least one writer is still running
+	uint64_t thisval = read_partitioned_counter(ta->pc);
+	assert(lastval <= thisval); // successive reads must never go backwards
+	assert(thisval <= ta->limit+2); // upper bound: all writer increments plus the two done by do_testit itself
+	lastval = thisval;
+	if (verboseness_cmdarg && (0==(thisval & (thisval-1)))) printf("ufc=%" PRIu64 " Thisval=%" PRIu64 "\n", ta->unfinished_count,thisval); // print only when thisval is 0 or a power of two
+    }
+    uint64_t thisval = read_partitioned_counter(ta->pc);
+    assert(thisval==ta->limit+2); // we incremented two extra times in the test
+    return ta_v;
+}
+
+static void *writer_test_fun (void *ta_v) { // Writer thread: perform this writer's share of increments, then report completion.
+    struct test_arguments *ta = (struct test_arguments *)ta_v;
+    for (uint64_t i=0; i<ta->total_increment_per_writer; i++) {
+	if (i%1000 == 0) sched_yield(); // yield the CPU once every 1000 increments
+	increment_partitioned_counter(ta->pc, 1);
+    }
+    uint64_t c __attribute__((__unused__)) = toku_sync_fetch_and_sub(&ta->unfinished_count, 1); // tell the reader one fewer writer is running
+    return ta_v;
+}
+    
+
+static void do_testit (void) { // Correctness test: NGROUPS independent counters, each with one reader and several writers.
+    const int NGROUPS = 2;
+    uint64_t limits[NGROUPS]; // total writer increments per group
+    limits [0] = 2000000;
+    limits [1] = 1000000;
+    uint64_t n_writers[NGROUPS]; // number of writer threads per group
+    n_writers[0] = 20;
+    n_writers[1] = 40;
+    struct test_arguments tas[NGROUPS];
+    pthread_t reader_threads[NGROUPS];
+    pthread_t *writer_threads[NGROUPS]; // one heap-allocated array of thread handles per group
+    for (int i=0; i<NGROUPS; i++) {
+        tas[i].pc                         = create_partitioned_counter();
+	tas[i].limit                      = limits[i];
+	tas[i].unfinished_count           = n_writers[i];
+	tas[i].total_increment_per_writer = limits[i]/n_writers[i];
+	assert(tas[i].total_increment_per_writer * n_writers[i] == limits[i]); // the limit must divide evenly among the writers
+	pt_create(&reader_threads[i], reader_test_fun, &tas[i]); // the reader is started before any writer
+        increment_partitioned_counter(tas[i].pc, 1); // make sure that the long-lived thread also increments the partitioned counter, to test for #5321.
+	MALLOC_N(n_writers[i], writer_threads[i]);
+	for (uint64_t j=0; j<n_writers[i] ; j++) {
+	    pt_create(&writer_threads[i][j], writer_test_fun, &tas[i]);
+	}
+        increment_partitioned_counter(tas[i].pc, 1); // make sure that the long-lived thread also increments the partitioned counter, to test for #5321.
+    }
+    for (int i=0; i<NGROUPS; i++) { // second pass: wait for every thread of the group, then release its resources
+	pt_join(reader_threads[i], &tas[i]);
+	for (uint64_t j=0; j<n_writers[i] ; j++) {
+	    pt_join(writer_threads[i][j], &tas[i]);
+	}
+	toku_free(writer_threads[i]);
+        destroy_partitioned_counter(tas[i].pc);
+    }
+}
+
+volatile int spinwait=0; // handshake flag: 0 = thread not ready, 1 = thread has incremented, 2 = main has destroyed the counter
+static void* test2_fun (void* mypc_v) { // Thread body for do_testit2: add 3 to the counter, signal, then outlive the counter.
+    PARTITIONED_COUNTER mypc = (PARTITIONED_COUNTER)mypc_v;
+    increment_partitioned_counter(mypc, 3);
+    spinwait=1; // tell do_testit2 the increment has happened
+    while (spinwait==1); // busy-wait until do_testit2 changes the flag
+    // mypc no longer points at a valid data structure.
+    return NULL;
+}
+
+static void do_testit2 (void) 
+// This test checks to see what happens if a thread is still live when we destruct a counter.
+//   A thread increments the counter, then lets us know through a spin wait, then waits until we destroy the counter.
+{
+    pthread_t t;
+    TOKU_VALGRIND_HG_DISABLE_CHECKING(&spinwait, sizeof(spinwait)); // this is a racy volatile variable.
+    {
+        PARTITIONED_COUNTER mypc = create_partitioned_counter();
+        increment_partitioned_counter(mypc, 1); // make sure that the long-lived thread also increments the partitioned counter, to test for #5321.
+        pt_create(&t, test2_fun, mypc);
+        while(spinwait==0); // wait until he incremented the counter.
+        increment_partitioned_counter(mypc, -1); // cancel this thread's own +1 so only the other thread's +3 remains
+        assert(read_partitioned_counter(mypc)==3);
+        destroy_partitioned_counter(mypc); // the other thread is still alive, spinning in test2_fun
+    } // leave scope, so the counter goes away.
+    spinwait=2; // tell the other guy to finish up.
+    pt_join(t, NULL); // test2_fun returns NULL
+}
+
+// Entry point for the test harness: after option parsing, run the two correctness
+// tests by default, or only the timing suite when --time was given.
+int test_main (int argc, const char *argv[]) {
+    parse_args(argc, argv);
+    if (!time_cmdarg) {
+        do_testit();
+        do_testit2();
+    } else do_timeit();
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/test_partitioned_counter_5833.cc b/storage/tokudb/PerconaFT/util/tests/test_partitioned_counter_5833.cc
new file mode 100644
index 00000000..52060e6a
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/test_partitioned_counter_5833.cc
@@ -0,0 +1,102 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+// Demonstrate a race if #5833 isn't fixed.
+
+#include <pthread.h>
+#include <toku_portability.h>
+#include <util/partitioned_counter.h>
+#include "test.h"
+
+
+// Start a thread running f(extra) with default attributes; assert that creation succeeded.
+static void pt_create (pthread_t *thread, void *(*f)(void*), void *extra) {
+    const int rc = pthread_create(thread, NULL, f, extra); assert(rc==0);
+}
+
+// Join `thread`; assert that the join succeeded and that the thread returned expect_extra.
+static void pt_join (pthread_t thread, void *expect_extra) {
+    void *retval;
+    const int rc = pthread_join(thread, &retval);
+    assert(rc==0); assert(retval==expect_extra);
+}
+
+static int verboseness_cmdarg=0;
+
+// Parse the command line: each "-v" raises verboseness_cmdarg; any other argument
+// prints the usage line and exits with status 1.
+static void parse_args (int argc, const char *argv[]) {
+    const char *progname = argv[0]; // program name (argv[1] would be the first option, not the name)
+    for (int i=1; i<argc; i++) {
+        if (strcmp(argv[i], "-v")==0) verboseness_cmdarg++;
+        else {
+            printf("Usage: %s [-v]\n", progname);
+            exit(1);
+        }
+    }
+}
+
+#define NCOUNTERS 2
+PARTITIONED_COUNTER array_of_counters[NCOUNTERS]; // slot i is filled in by the thread given index i
+
+// Thread body: *tnum_pv is this thread's index; create a counter into that slot and return the argument.
+static void *counter_init_fun(void *tnum_pv) {
+    const int slot = *(int*)tnum_pv;
+    assert(0<=slot  && slot<NCOUNTERS);
+    array_of_counters[slot] = create_partitioned_counter();
+    return tnum_pv;
+}
+
+// Start NCOUNTERS threads, each of which creates one partitioned counter, so that the
+// creations can run concurrently.  Then join the threads in index order and destroy
+// each counter from this thread right after its creator has been joined.
+static void do_test_5833(void) {
+    pthread_t workers[NCOUNTERS];
+    int       indexes[NCOUNTERS];
+    for (int n=0; n<NCOUNTERS; n++) { indexes[n] = n; pt_create(&workers[n], counter_init_fun, &indexes[n]); }
+    for (int n=0; n<NCOUNTERS; n++) {
+        pt_join(workers[n], &indexes[n]);
+        destroy_partitioned_counter(array_of_counters[n]);
+    }
+}
+
+int test_main(int argc, const char *argv[]) { // entry point: parse the options, then run the #5833 test once
+    parse_args(argc, argv);
+    do_test_5833();
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/threadpool-nproc-limit.cc b/storage/tokudb/PerconaFT/util/tests/threadpool-nproc-limit.cc
new file mode 100644
index 00000000..d645a3a6
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/threadpool-nproc-limit.cc
@@ -0,0 +1,119 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+// this test verifies that the toku thread pool is resilient when hitting the nproc limit.
+
+#include <util/threadpool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/resource.h>
+
+int verbose = 0; // verbosity level; raised by -v/--verbose and lowered by -q in main()
+
+static int usage(void) { // print the accepted options and the current verbose level to stderr; returns 1
+    fprintf(stderr, "[-q] [-v] [--verbose] (%d)\n", verbose);
+    return 1;
+}
+
+static void *f(void *arg) { // trivial job handed to the pool: returns its argument immediately
+    return arg;
+}
+
+static int dotest(int the_limit) { // Run one pool request with the soft RLIMIT_NPROC set to the_limit; returns nonzero iff got_n > 0.
+    if (verbose)
+        fprintf(stderr, "%s:%u %d\n", __FILE__, __LINE__, the_limit);
+    int r;
+    struct toku_thread_pool *pool = nullptr;
+    r = toku_thread_pool_create(&pool, 10);
+    assert(r == 0 && pool != nullptr);
+
+    struct rlimit current_nproc_limit; // saved so the original limit can be restored below
+    r = getrlimit(RLIMIT_NPROC, &current_nproc_limit);
+    assert(r == 0);
+    
+    struct rlimit new_nproc_limit = current_nproc_limit;
+    new_nproc_limit.rlim_cur = the_limit; // only the soft limit is changed; rlim_max keeps its current value
+    r = setrlimit(RLIMIT_NPROC, &new_nproc_limit);
+    assert(r == 0);
+
+    int want_n = 20;
+    int got_n = want_n; // in/out argument -- presumably the number of threads actually obtained on return; confirm in threadpool.cc
+    r = toku_thread_pool_run(pool, 0, &got_n, f, nullptr);
+    if (r == 0)
+        assert(want_n == got_n); // success must mean the full request was satisfied
+    else {
+        assert(r == EWOULDBLOCK); // the only failure accepted here
+        assert(got_n <= want_n);
+    }
+
+    r = setrlimit(RLIMIT_NPROC, &current_nproc_limit); // put the original limit back before tearing the pool down
+    assert(r == 0);
+
+    if (verbose)
+        toku_thread_pool_print(pool, stderr);
+    toku_thread_pool_destroy(&pool);
+    return got_n > 0;
+}
+
+// Parse the -v/--verbose/-q options (stopping at the first argument that does not start
+// with '-'; any other option prints usage and returns 1), then probe the nproc limit.
+int main(int argc, char *argv[]) {
+    for (int argi = 1; argi < argc && argv[argi][0] == '-'; argi++) {
+        const char *opt = argv[argi];
+        if (strcmp(opt, "-v") == 0 || strcmp(opt, "--verbose") == 0) {
+            verbose++;
+        } else if (strcmp(opt, "-q") == 0) {
+            // -q lowers verbosity by one step but never below zero
+            if (verbose > 0) verbose--; else verbose = 0;
+        } else {
+            return usage();
+        }
+    }
+
+    // Try limit 0, 1, 2, ... and stop at the first one for which dotest() returns
+    // nonzero, i.e. the first limit under which the pool obtained at least one thread.
+    int limit = 0;
+    while (!dotest(limit)) {
+        limit++;
+    }
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/threadpool-test.cc b/storage/tokudb/PerconaFT/util/tests/threadpool-test.cc
new file mode 100644
index 00000000..83c142ed
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/threadpool-test.cc
@@ -0,0 +1,170 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "test.h"
+#include <util/threadpool.h>
+
+#include <memory.h>
+#include <toku_os.h>
+#include <toku_portability.h>
+#include <portability/toku_pthread.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <string.h>
+#include <errno.h>
+#if defined(HAVE_MALLOC_H)
+# include <malloc.h>
+#elif defined(HAVE_SYS_MALLOC_H)
+# include <sys/malloc.h>
+#endif
+
+struct my_threadpool { // test fixture shared by the main thread and every pool thread
+    THREADPOOL threadpool; // pool under test, created in my_threadpool_init
+    toku_mutex_t mutex; // held while my_thread_f updates counter and while closed is read or set
+    toku_cond_t wait; // pool threads block on this until closed becomes nonzero
+    int closed; // set to 1 by my_threadpool_destroy to release the waiting threads
+    int counter; // number of pool threads that have entered my_thread_f
+};
+
+// Set up *my_threadpool: a thread pool capped at max_threads, plus the mutex, condition
+// variable, `closed` flag and `counter` that my_thread_f and my_threadpool_destroy use.
+static void
+my_threadpool_init (struct my_threadpool *my_threadpool, int max_threads) {
+    int r = toku_thread_pool_create(&my_threadpool->threadpool, max_threads);
+    assert(r == 0 && my_threadpool->threadpool != 0); // check the new pool handle, not the already-dereferenced argument
+    toku_mutex_init(toku_uninstrumented, &my_threadpool->mutex, nullptr);
+    toku_cond_init(toku_uninstrumented, &my_threadpool->wait, nullptr);
+    my_threadpool->closed = 0;
+    my_threadpool->counter = 0;
+}
+
+static void
+my_threadpool_destroy (struct my_threadpool *my_threadpool, int max_threads) { // release the parked threads, destroy the pool, check that max_threads threads ran
+    toku_mutex_lock(&my_threadpool->mutex);
+    my_threadpool->closed = 1; // lets every my_thread_f leave its wait loop
+    toku_cond_broadcast(&my_threadpool->wait);
+    toku_mutex_unlock(&my_threadpool->mutex);
+
+    if (verbose) printf("current %d\n", toku_thread_pool_get_current_threads(my_threadpool->threadpool));
+    toku_thread_pool_destroy(&my_threadpool->threadpool); assert(my_threadpool->threadpool == 0); // destroy is expected to null the handle
+    assert(my_threadpool->counter == max_threads); // exactly max_threads threads must have entered my_thread_f
+    toku_mutex_destroy(&my_threadpool->mutex);
+    toku_cond_destroy(&my_threadpool->wait);
+}
+
+static void *
+my_thread_f (void *arg) { // pool job: count this thread in, then park until the fixture is closed
+    struct my_threadpool *CAST_FROM_VOIDP(my_threadpool, arg);
+    toku_mutex_lock(&my_threadpool->mutex);
+    my_threadpool->counter++; // record that one more pool thread has started
+    while (!my_threadpool->closed) { // re-check after every wakeup
+        toku_cond_wait(&my_threadpool->wait, &my_threadpool->mutex);
+    }
+    toku_mutex_unlock(&my_threadpool->mutex);
+    if (verbose) printf("%lu:%s:exit\n", (unsigned long)toku_os_gettid(), __FUNCTION__); 
+    return arg;
+}
+
+static void *my_malloc_always_fails(size_t n UU()) { // malloc replacement installed via toku_set_func_malloc: always fails
+    errno = ENOMEM; // report the failure the way a real out-of-memory malloc would
+    return NULL;
+}
+
+static int
+usage (void) { // print the command-line help to stdout; returns 1 so callers can `return usage();`
+    printf("threadpool-test: [-v] [-malloc-fail] [N]\n");
+    printf("-malloc-fail     simulate malloc failures\n");
+    printf("N                max number of threads in the thread pool\n");
+    return 1;
+}
+
+int
+test_main (int argc, const char *argv[]) { // fill a pool to its thread cap, check the counts, then optionally test creation under malloc failure
+    int max_threads = 1;
+    int do_malloc_fail = 0;
+
+    int i;
+    for (i=1; i<argc; i++) {
+        const char *arg = argv[i];
+        if (strcmp(arg, "-h") == 0 || strcmp(arg, "-help") == 0) {
+            return usage();
+        } else if (strcmp(arg, "-v") == 0) {
+            verbose++;
+            continue;
+        } else if (strcmp(arg, "-q") == 0) {
+            verbose = 0;
+            continue;
+        } else if (strcmp(arg, "-malloc-fail") == 0) {
+            do_malloc_fail = 1;
+            continue;
+        } else
+            max_threads = atoi(arg); // any unrecognized argument is taken as N; the last one wins
+    }
+
+    struct my_threadpool my_threadpool;
+    THREADPOOL threadpool;
+
+    ZERO_STRUCT(my_threadpool);
+    my_threadpool_init(&my_threadpool, max_threads);
+    threadpool = my_threadpool.threadpool;
+    if (verbose) printf("test threadpool_set_busy\n");
+    for (i=0; i<2*max_threads; i++) { // ask for twice as many threads as the pool allows
+        assert(toku_thread_pool_get_current_threads(threadpool) == (i >= max_threads ? max_threads : i)); // thread count grows by one per request, then stays at the cap
+        int n = 1;
+        toku_thread_pool_run(threadpool, 0, &n, my_thread_f, &my_threadpool); // return value deliberately not checked: requests past the cap are not expected to start a thread
+    }
+    assert(toku_thread_pool_get_current_threads(threadpool) == max_threads);
+    my_threadpool_destroy(&my_threadpool, max_threads);
+    
+    if (do_malloc_fail) {
+        if (verbose) printf("test threadpool_create with malloc failure\n");
+        // test threadpool malloc fails causes ENOMEM
+
+        toku_set_func_malloc(my_malloc_always_fails);
+        int r;
+        threadpool = NULL;
+        r = toku_thread_pool_create(&threadpool, 0); assert(r == ENOMEM);
+        r = toku_thread_pool_create(&threadpool, 1); assert(r == ENOMEM);
+        toku_set_func_malloc(NULL); // uninstall the failing allocator
+    }
+
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/threadpool-testrunf.cc b/storage/tokudb/PerconaFT/util/tests/threadpool-testrunf.cc
new file mode 100644
index 00000000..4db38c73
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/threadpool-testrunf.cc
@@ -0,0 +1,114 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <util/threadpool.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+
+int verbose = 0; // verbosity level; raised by -v/--verbose and lowered by -q in main()
+
+static int usage(int poolsize) { // print the accepted options with their current values to stderr; returns 1
+    fprintf(stderr, "[-q] [-v] [--verbose] (%d)\n", verbose);
+    fprintf(stderr, "[--poolsize %d]\n", poolsize);
+    return 1;
+}
+
+static void *f(void *arg) { // trivial job handed to the pool: returns its argument immediately
+    return arg;
+}
+
+// Create a pool with `poolsize` as its size argument, submit f(NULL) to it `nloops` times,
+// one job per call (every call must return 0), optionally print the pool, then destroy it.
+static void dotest(int poolsize, int nloops) {
+    struct toku_thread_pool *pool = NULL;
+    int rc = toku_thread_pool_create(&pool, poolsize);
+    assert(rc == 0 && pool != NULL);
+
+    for (int done = 0; done < nloops; done++) {
+        int njobs = 1;
+        rc = toku_thread_pool_run(pool, 1, &njobs, f, NULL);
+        assert(rc == 0);
+    }
+
+    if (verbose)
+        toku_thread_pool_print(pool, stderr);
+    toku_thread_pool_destroy(&pool);
+}
+
+int main(int argc, char *argv[]) { // usage: [options] [nloops ...] -- run dotest once per nloops value, or once with the default
+    // defaults
+    int poolsize = 1;
+    int nloops = 100000;
+
+    // options
+    int i;
+    for (i = 1; i < argc; i++) {
+        char *arg = argv[i];
+        if (arg[0] != '-') // first non-option argument ends option parsing
+            break;
+        if (strcmp(arg, "--poolsize") == 0 && i+1 < argc) { // a trailing --poolsize with no value falls through to usage()
+            poolsize = atoi(argv[++i]);
+            continue;
+        }
+        if (strcmp(arg, "-v") == 0 || strcmp(arg, "--verbose") == 0) {
+            verbose = verbose+1;
+            continue;
+        }
+        if (strcmp(arg, "-q") == 0) {
+            verbose = verbose > 0 ? verbose-1 : 0; // never drops below zero
+            continue;
+        }
+
+        return usage(poolsize); // unknown option
+    }
+    int starti = i; // index of the first positional argument, or argc if there is none
+
+    if (starti == argc) {
+        dotest(poolsize, nloops);
+    } else {
+        for (i = starti; i < argc; i++) { // every remaining argument is a loop count for a separate run
+            nloops = atoi(argv[i]);
+            dotest(poolsize, nloops);
+        }
+    }
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/tests/x1764-test.cc b/storage/tokudb/PerconaFT/util/tests/x1764-test.cc
new file mode 100644
index 00000000..76b1d9c7
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/tests/x1764-test.cc
@@ -0,0 +1,139 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "test.h"
+#include <util/x1764.h>
+
+// Empty input: both the one-shot and the incremental checksum must equal ~0U.
+static void test0 (void) {
+    const uint32_t oneshot = toku_x1764_memory("", 0);
+    assert(oneshot==~(0U));
+    struct x1764 state;
+    toku_x1764_init(&state);
+    toku_x1764_add(&state, "", 0);
+    const uint32_t incremental = toku_x1764_finish(&state);
+    assert(incremental==~(0U));
+}
+
+static void
+test1 (void) { // check toku_x1764_memory over the first i bytes of a known 64-bit value, for i = 0..8
+    uint64_t v=0x123456789abcdef0ULL;
+    uint32_t c;
+    int i;
+    for (i=0; i<=8; i++) {
+	uint64_t expect64 = (i==8) ? v : v&((1LL<<(8*i))-1); // low 8*i bits of v (the first i bytes in memory on a little-endian host -- NOTE(review): confirm); i==8 is special-cased to avoid a 64-bit shift
+	uint32_t expect = expect64 ^ (expect64>>32); // fold the upper 32 bits into the lower 32
+	c = toku_x1764_memory(&v, i);
+	//printf("i=%d c=%08x expect=%08x\n", i, c, expect);
+	assert(c==~expect); // the checksum must be the bitwise complement of the folded value
+    }
+}
+
+// Compute checksums incrementally, using various strides
+static void
+test2 (void) { // for every subrange of a random buffer, chunked checksums must equal the one-shot checksum
+    enum { N=200 };
+    char v[N];
+    int i;
+    for (i=0; i<N; i++) v[i]=(char)random();
+    for (i=0; i<N; i++) {
+	int j;
+	for (j=i; j<=N; j++) {
+	    // checksum from i (inclusive) to j (exclusive)
+	    uint32_t c = toku_x1764_memory(&v[i], j-i);
+	    // Now compute the checksum incrementally with various strides.
+	    int stride;
+	    for (stride=1; stride<=j-i; stride++) {
+		int k;
+		struct x1764 s;
+		toku_x1764_init(&s);
+		for (k=i; k+stride<=j; k+=stride) { // feed as many whole strides as fit
+		    toku_x1764_add(&s, &v[k], stride);
+		}
+		toku_x1764_add(&s, &v[k], j-k); // then the remainder, which may be empty
+		uint32_t c2 = toku_x1764_finish(&s);
+		assert(c2==c);
+	    }
+	    // Now use some random strides.
+	    {
+		int k=i;
+		struct x1764 s;
+		toku_x1764_init(&s);
+		while (1) {
+		    stride=random()%16; // 0..15, so zero-length adds are exercised too
+		    if (k+stride>j) break;
+		    toku_x1764_add(&s, &v[k], stride);
+		    k+=stride;
+		}
+		toku_x1764_add(&s, &v[k], j-k); // whatever is left after the last stride that fit
+		uint32_t c2 = toku_x1764_finish(&s);
+		assert(c2==c);
+	    }
+	}
+    }
+}
+
+static void
+test3 (void)
+// Compare the simple version to the highly optimized version.
+{
+    const int datalen = 1000;
+    char data[datalen];
+    for (int i=0; i<datalen; i++) data[i]=random();
+    for (int off=0; off<32; off++) { // try 32 different starting offsets into the buffer
+	if (verbose) {printf("."); fflush(stdout);}
+	for (int len=0; len+off<datalen; len++) { // every length that stays inside the buffer
+	    uint32_t reference_sum = toku_x1764_memory_simple(data+off, len);
+	    uint32_t fast_sum      = toku_x1764_memory       (data+off, len);
+	    assert(reference_sum==fast_sum);
+	}
+    }
+}
+
+int
+test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { // run test0..test3 in order; arguments are ignored
+    if (verbose) printf("0\n"); // with verbose set, print each test's number before running it
+    test0();
+    if (verbose) printf("1\n");
+    test1();
+    if (verbose) printf("2\n");
+    test2();
+    if (verbose) printf("3\n");
+    test3();
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/util/threadpool.cc b/storage/tokudb/PerconaFT/util/threadpool.cc
new file mode 100644
index 00000000..6e0ccf05
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/threadpool.cc
@@ -0,0 +1,298 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <memory.h>
+#include <toku_portability.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <toku_assert.h>
+#include <toku_list.h>
+#include <portability/toku_pthread.h>
+
+#include "threadpool.h"
+
+// Instrumentation keys used by the thread pool.  Each one is dereferenced
+// and handed to the matching init/create call in this file:
+//   tpool_lock_mutex_key   -> toku_mutex_init for the pool lock
+//   tp_thread_wait_key     -> toku_cond_init for each thread's wait condition
+//   tp_pool_wait_free_key  -> toku_cond_init for the pool's wait_free condition
+//   tp_internal_thread_key -> toku_pthread_create for each pool thread
+// NOTE(review): nothing in this file assigns them, so they are presumably
+// set elsewhere before the first pool is created -- confirm.
+toku_instr_key *tpool_lock_mutex_key;
+toku_instr_key *tp_thread_wait_key;
+toku_instr_key *tp_pool_wait_free_key;
+toku_instr_key *tp_internal_thread_key;
+
+// One worker thread owned by a toku_thread_pool.
+struct toku_thread {
+    struct toku_thread_pool *pool;  // pool this thread belongs to (set in toku_thread_create)
+    toku_pthread_t tid;             // id filled in by toku_pthread_create; joined in toku_thread_destroy
+    void *(*f)(void *arg);          // function to run next; set by toku_thread_run, reset to nullptr by the worker after it has run
+    void *arg;                      // argument passed to f
+    int doexit;                     // set to 1 by toku_thread_ask_exit to make the worker leave its loop
+    struct toku_list free_link;     // link used on pool->free_threads
+    struct toku_list all_link;      // link used on pool->all_threads
+    toku_cond_t wait;               // the worker waits here (together with pool->lock) until f or doexit is set
+};
+
+// A pool of toku_threads.
+struct toku_thread_pool {
+    int max_threads;                // upper bound on cur_threads; 0 is treated as "no limit" by toku_thread_pool_get_one
+    int cur_threads;                // incremented for each thread added, decremented for each thread destroyed
+    struct toku_list free_threads;  // threads currently available to toku_thread_pool_get_one
+    struct toku_list all_threads;   // every thread that has been added to the pool
+
+    toku_mutex_t lock;              // taken via toku_thread_pool_lock / toku_thread_pool_unlock
+    toku_cond_t wait_free;          // signaled by workers and by toku_thread_pool_add; waited on in toku_thread_pool_get_one
+    
+    uint64_t gets, get_blocks;      // gets: calls to toku_thread_pool_get_one; get_blocks: times such a call went on to wait on wait_free
+};
+
+// Forward declarations: these functions are used above the point in this
+// file where they are defined.
+static void *toku_thread_run_internal(void *arg);
+static void toku_thread_pool_lock(struct toku_thread_pool *pool);
+static void toku_thread_pool_unlock(struct toku_thread_pool *pool);
+
+// Allocate a toku_thread for `pool` and start its worker, which runs
+// toku_thread_run_internal with the new toku_thread as its argument.
+// On success returns 0 and stores the new thread in *toku_thread_return.
+// On failure returns a nonzero error number (get_error_errno() if the
+// allocation failed, otherwise the result of toku_pthread_create) and
+// stores nullptr in *toku_thread_return.
+// The new thread is not linked onto any pool list here; the caller does that.
+static int 
+toku_thread_create(struct toku_thread_pool *pool, struct toku_thread **toku_thread_return) {
+    int r;
+    struct toku_thread *MALLOC(thread);
+    if (thread == nullptr) {
+        r = get_error_errno();
+    } else {
+        // Start from an all-zero thread: f == nullptr and doexit == 0 mean
+        // "nothing to do yet" to the worker loop.
+        memset(thread, 0, sizeof *thread);
+        thread->pool = pool;
+        toku_cond_init(*tp_thread_wait_key, &thread->wait, nullptr);
+        r = toku_pthread_create(*tp_internal_thread_key,
+                                &thread->tid,
+                                nullptr,
+                                toku_thread_run_internal,
+                                thread);
+        if (r) {
+            // The worker never started: undo the partial construction.
+            toku_cond_destroy(&thread->wait);
+            toku_free(thread);
+            thread = nullptr;
+        }
+    }
+    // Written on every path (nullptr on both failure paths) so the caller
+    // never reads a pointer this function did not set.
+    *toku_thread_return = thread;
+    return r;
+}
+
+// Hand work to a thread: while holding the lock of the thread's pool,
+// record f and arg in the thread and signal its wait condition so that the
+// worker loop in toku_thread_run_internal sees the new function.
+void toku_thread_run(struct toku_thread *thread, void *(*f)(void *arg), void *arg) {
+    struct toku_thread_pool *owner = thread->pool;
+    toku_thread_pool_lock(owner);
+    thread->arg = arg;
+    thread->f = f;
+    toku_cond_signal(&thread->wait);
+    toku_thread_pool_unlock(owner);
+}
+
+// Reap one pool thread: join it, unlink its free_link while holding the
+// pool lock, then destroy its condition variable and free its memory.
+// The join must succeed and the joined thread must have returned its own
+// toku_thread pointer; both are checked with invariant().
+static void
+toku_thread_destroy(struct toku_thread *thread) {
+    void *ret;
+    int r = toku_pthread_join(thread->tid, &ret);
+    invariant(r == 0 && ret == thread);
+
+    struct toku_thread_pool *pool = thread->pool;
+    toku_thread_pool_lock(pool);
+    toku_list_remove(&thread->free_link);
+    toku_thread_pool_unlock(pool);
+
+    toku_cond_destroy(&thread->wait);
+    toku_free(thread);
+}
+
+// Ask a worker to stop: set its doexit flag and signal its wait condition.
+// This function does not take the pool lock itself; its only caller in this
+// file, toku_thread_pool_destroy, holds the lock around the call.
+static void toku_thread_ask_exit(struct toku_thread *thread) {
+    thread->doexit = 1;
+    toku_cond_signal(&thread->wait);
+}
+
+// Body of every pool thread; arg is the thread's own struct toku_thread.
+// Each pass of the outer loop: signal pool->wait_free, wait on thread->wait
+// (with the pool lock) until a function or an exit request is present, drop
+// the lock, run the function if there is one, and then either leave the loop
+// (exit requested) or clear f and push the thread back on the free list.
+// Returns the result of toku_pthread_done(arg); toku_thread_destroy checks
+// that the value obtained from the join equals the thread pointer.
+static void *
+toku_thread_run_internal(void *arg) {
+    struct toku_thread *thread = (struct toku_thread *) arg;
+    struct toku_thread_pool *pool = thread->pool;
+    toku_thread_pool_lock(pool);
+    while (1) {
+        // Signal toku_thread_pool_get_one: on the first pass this thread has
+        // just started, on later passes it was just pushed on the free list.
+        toku_cond_signal(&pool->wait_free);
+        void *(*thread_f)(void *); void *thread_arg; int doexit;
+        while (1) {
+            thread_f = thread->f; thread_arg = thread->arg; doexit = thread->doexit; // make copies of these variables to make helgrind happy
+            if (thread_f || doexit) 
+                break;
+            toku_cond_wait(&thread->wait, &pool->lock);
+        }
+        // The function runs without the pool lock held.
+        toku_thread_pool_unlock(pool);
+        if (thread_f)
+            (void) thread_f(thread_arg);
+        // Exit was requested: leave without retaking the lock or rejoining
+        // the free list.
+        if (doexit)
+            break;
+        toku_thread_pool_lock(pool);
+        thread->f = nullptr;
+        toku_list_push(&pool->free_threads, &thread->free_link);
+    }
+    return toku_pthread_done(arg);
+}
+
+// Allocate and initialize an empty thread pool whose size limit is
+// max_threads (toku_thread_pool_get_one treats 0 as "no limit").
+// Returns 0 and stores the pool in *pool_return on success.  If the
+// allocation fails, returns get_error_errno() and leaves *pool_return
+// untouched.
+int toku_thread_pool_create(struct toku_thread_pool **pool_return,
+                            int max_threads) {
+    struct toku_thread_pool *CALLOC(pool);
+    if (pool == nullptr) {
+        return get_error_errno();
+    }
+
+    pool->max_threads = max_threads;
+    pool->cur_threads = 0;
+    toku_list_init(&pool->all_threads);
+    toku_list_init(&pool->free_threads);
+    toku_mutex_init(*tpool_lock_mutex_key, &pool->lock, nullptr);
+    toku_cond_init(*tp_pool_wait_free_key, &pool->wait_free, nullptr);
+
+    *pool_return = pool;
+    return 0;
+}
+
+// Acquire the pool's mutex.
+static void toku_thread_pool_lock(struct toku_thread_pool *pool) {
+    toku_mutex_lock(&pool->lock);
+}
+
+// Release the pool's mutex.
+static void toku_thread_pool_unlock(struct toku_thread_pool *pool) {
+    toku_mutex_unlock(&pool->lock);
+}
+
+// Shut a pool down and free it; *poolptr is set to nullptr first.
+// Every thread on the all_threads list is asked to exit (under the pool
+// lock) and is then joined and freed one at a time.  Afterwards
+// cur_threads must be 0, and the pool's condition variable, mutex and
+// memory are released.
+void toku_thread_pool_destroy(struct toku_thread_pool **poolptr) {
+    struct toku_thread_pool *pool = *poolptr;
+    *poolptr = nullptr;
+
+    // Phase 1: with the lock held, flag every thread for exit.
+    toku_thread_pool_lock(pool);
+    struct toku_list *const sentinel = &pool->all_threads;
+    struct toku_list *link = sentinel->next;
+    while (link != sentinel) {
+        struct toku_thread *worker = toku_list_struct(link, struct toku_thread, all_link);
+        toku_thread_ask_exit(worker);
+        link = link->next;
+    }
+    toku_thread_pool_unlock(pool);
+
+    // Phase 2: pop each thread off all_threads, then join and free it.
+    while (!toku_list_empty(&pool->all_threads)) {
+        struct toku_list *head = toku_list_pop_head(&pool->all_threads);
+        struct toku_thread *worker = toku_list_struct(head, struct toku_thread, all_link);
+        toku_thread_destroy(worker);
+        pool->cur_threads -= 1;
+    }
+
+    invariant(pool->cur_threads == 0);
+
+    // Phase 3: release the pool's own resources.
+    toku_cond_destroy(&pool->wait_free);
+    toku_mutex_destroy(&pool->lock);
+    toku_free(pool);
+}
+
+// Create one more thread for the pool.  On success the thread is linked
+// onto both all_threads and free_threads, cur_threads is incremented and
+// wait_free is signaled; returns 0.  On failure the pool is left unchanged
+// and the error from toku_thread_create is returned.
+// Does not take the pool lock; its only caller in this file,
+// toku_thread_pool_get_one, already holds it.
+static int toku_thread_pool_add(struct toku_thread_pool *pool) {
+    struct toku_thread *thread = nullptr;
+    int r = toku_thread_create(pool, &thread);
+    if (r != 0) {
+        return r;
+    }
+    pool->cur_threads += 1;
+    toku_list_push(&pool->all_threads, &thread->all_link);
+    toku_list_push(&pool->free_threads, &thread->free_link);
+    toku_cond_signal(&pool->wait_free);
+    return 0;
+}
+
+// Get one thread from the free pool.
+//   dowait: when zero, give up with EWOULDBLOCK if the free list is still
+//           empty after trying to grow the pool; when nonzero, keep waiting
+//           on pool->wait_free until the free list is not empty.
+// Returns 0 and stores the thread (removed from the free list) in
+// *toku_thread_return, or returns EWOULDBLOCK and stores nullptr there.
+// The pool lock is held for the whole call, apart from whatever
+// toku_cond_wait does with it.
+static int 
+toku_thread_pool_get_one(struct toku_thread_pool *pool, int dowait, struct toku_thread **toku_thread_return) {
+    int r = 0;
+    toku_thread_pool_lock(pool);
+    pool->gets++;
+    while (1) {
+        // Done as soon as some thread is sitting on the free list.
+        if (!toku_list_empty(&pool->free_threads))
+            break;
+        // Otherwise try to grow the pool if the limit allows it
+        // (max_threads == 0 means there is no limit).
+        // NOTE(review): the result of toku_thread_pool_add is discarded, so a
+        // failed thread creation is not reported to the caller; with dowait
+        // set the code below then waits for some other signal on wait_free --
+        // confirm this is intended.
+        if (pool->max_threads == 0 || pool->cur_threads < pool->max_threads)
+            (void) toku_thread_pool_add(pool);
+        if (toku_list_empty(&pool->free_threads) && !dowait) {
+            r = EWOULDBLOCK;
+            break;
+        }
+        // Reached when the list is still empty and dowait is set, and also
+        // right after a successful add (the list is then not empty, so the
+        // check above does not fire, whatever dowait is).  A worker signals
+        // wait_free at the top of every pass of toku_thread_run_internal,
+        // including its first; the loop re-checks the free list afterwards.
+        pool->get_blocks++;
+        toku_cond_wait(&pool->wait_free, &pool->lock);
+    }
+    if (r == 0) {
+        struct toku_list *list = toku_list_pop_head(&pool->free_threads);
+        struct toku_thread *thread = toku_list_struct(list, struct toku_thread, free_link);
+        *toku_thread_return = thread;
+    } else
+        *toku_thread_return = nullptr;
+    toku_thread_pool_unlock(pool);
+    return r;
+}
+
+// Take up to *nthreads threads from the pool, one at a time, storing them
+// in toku_thread_return[0..].  Stops at the first failure.
+// On return *nthreads is the number of threads obtained, and the result is
+// 0 if all were obtained or the error from the failed attempt otherwise.
+int toku_thread_pool_get(struct toku_thread_pool *pool, int dowait, int *nthreads, struct toku_thread **toku_thread_return) {
+    const int wanted = *nthreads;
+    int r = 0;
+    int got = 0;
+    while (got < wanted) {
+        r = toku_thread_pool_get_one(pool, dowait, &toku_thread_return[got]);
+        if (r != 0) {
+            break;
+        }
+        got++;
+    }
+    *nthreads = got;
+    return r;
+}
+
+// Take up to *nthreads threads from the pool and start f(arg) on each one
+// that was obtained.  When the get returns 0 or EWOULDBLOCK, the threads
+// obtained so far are started; for any other result nothing is started.
+// On return *nthreads is the number of threads obtained, and the result is
+// whatever toku_thread_pool_get returned.
+int toku_thread_pool_run(struct toku_thread_pool *pool, int dowait, int *nthreads, void *(*f)(void *arg), void *arg) {
+    int requested = *nthreads;
+    struct toku_thread *tids[requested];
+    int r = toku_thread_pool_get(pool, dowait, nthreads, tids);
+    if (r != 0 && r != EWOULDBLOCK) {
+        return r;
+    }
+    // *nthreads now holds how many entries of tids were filled in.
+    int obtained = *nthreads;
+    for (int i = 0; i < obtained; i++) {
+        toku_thread_run(tids[i], f, arg);
+    }
+    return r;
+}
+
+// Write one line to `out` with this source file and line, the address of
+// the pool, and the pool's gets and get_blocks counters.
+// The counters are read without taking the pool lock.
+void 
+toku_thread_pool_print(struct toku_thread_pool *pool, FILE *out) {
+    // %p is specified for a void * argument, so the pool pointer is
+    // converted explicitly before being passed through the variadic call.
+    fprintf(out, "%s:%d %p %llu %llu\n", __FILE__, __LINE__, (void *) pool, (long long unsigned) pool->gets, (long long unsigned) pool->get_blocks);
+}
+
+// Return the pool's cur_threads counter.  The value is read without taking
+// the pool lock.
+int toku_thread_pool_get_current_threads(struct toku_thread_pool *pool) {
+    return pool->cur_threads;
+}
diff --git a/storage/tokudb/PerconaFT/util/threadpool.h b/storage/tokudb/PerconaFT/util/threadpool.h
new file mode 100644
index 00000000..eba239f8
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/threadpool.h
@@ -0,0 +1,85 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stdio.h>
+
+// A toku_thread is a toku_pthread that can be cached: it belongs to a
+// toku_thread_pool and goes back on the pool's free list after it has run a function.
+struct toku_thread;
+
+// Run a function f on a thread
+// This function sets up the thread to run function f with argument arg and then wakes up
+// the thread to run it.
+void toku_thread_run(struct toku_thread *thread, void *(*f)(void *arg), void *arg);
+
+// A toku_thread_pool is a pool of toku_threads.  These threads can be allocated from the pool
+// and can run an arbitrary function.
+struct toku_thread_pool;
+
+typedef struct toku_thread_pool *THREADPOOL;
+
+// Create a new threadpool
+// Effects: a new threadpool is allocated and initialized. The number of threads in the threadpool is limited to max_threads.
+// If max_threads == 0 then there is no limit on the number of threads in the pool.
+// Initially, there are no threads in the pool. Threads are allocated by the _get or _run functions.
+// Returns: if there are no errors, the threadpool is set and zero is returned.  Otherwise, an error number is returned.
+int toku_thread_pool_create(struct toku_thread_pool **threadpoolptr, int max_threads);
+
+// Destroy a threadpool
+// Effects: the calling thread asks every thread in the threadpool to exit and joins with each of them.
+// Effects: the threadpool memory is freed.
+// Returns: the threadpool is set to null.
+void toku_thread_pool_destroy(struct toku_thread_pool **threadpoolptr);
+
+// Get the current number of threads in the thread pool
+int toku_thread_pool_get_current_threads(struct toku_thread_pool *pool);
+
+// Get one or more threads from the thread pool
+// dowait indicates whether or not the caller blocks waiting for threads to free up
+// nthreads on input determines the number of threads that are wanted
+// nthreads on output indicates the number of threads that were allocated
+// toku_thread_return on input supplies an array of thread pointers (all NULL).  This function returns the threads
+// that were allocated in the array.
+// Returns: zero if all of the wanted threads were allocated.  Otherwise the error from the first attempt that
+// failed (EWOULDBLOCK when dowait is zero and no thread is available); no further threads are requested after that.
+int toku_thread_pool_get(struct toku_thread_pool *pool, int dowait, int *nthreads, struct toku_thread **toku_thread_return);
+
+// Run a function f on one or more threads allocated from the thread pool
+// dowait and nthreads have the same meaning as for toku_thread_pool_get.
+// f is started with argument arg on every thread that was allocated, provided the allocation returned zero or EWOULDBLOCK.
+// Returns: the value returned by the allocation.
+int toku_thread_pool_run(struct toku_thread_pool *pool, int dowait, int *nthreads, void *(*f)(void *arg), void *arg);
+
+// Print the state of the thread pool
+// One line is written to out: a source file name and line number, the pool's address, the number of single-thread
+// requests made so far, and the number of times such a request waited for a thread.
+void toku_thread_pool_print(struct toku_thread_pool *pool, FILE *out);
diff --git a/storage/tokudb/PerconaFT/util/x1764.cc b/storage/tokudb/PerconaFT/util/x1764.cc
new file mode 100644
index 00000000..22f02cf9
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/x1764.cc
@@ -0,0 +1,244 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <toku_stdlib.h>
+#include <portability/toku_portability.h>
+
+#include "x1764.h"
+
+// Switch for the debugging printf calls in this file, each of which is guarded by "if (PRINT)"; 0 disables them.
+#define PRINT 0
+
+// Straightforward x1764 checksum of the first len bytes of buf.
+// The running value starts at 0 and becomes value*17 + word for each
+// complete 8-byte word, read as a uint64_t.  If 1..7 bytes remain they are
+// packed into one more word, first byte in the lowest position, and folded
+// in the same way.  The result is the bitwise complement of the xor of the
+// low and high 32-bit halves of the running value.
+uint32_t toku_x1764_memory_simple (const void *buf, int len)
+{
+    const uint64_t *CAST_FROM_VOIDP(words, buf);
+    uint64_t acc = 0;
+    int remaining = len;
+    for (; remaining >= 8; remaining -= 8, words++) {
+        acc = acc*17 + *words;
+        if (PRINT) printf("%d: c=%016" PRIx64 " sum=%016" PRIx64 "\n", __LINE__, *words, acc);
+    }
+    if (remaining > 0) {
+        // Gather the trailing bytes into a final partial word.
+        const uint8_t *tail = (uint8_t*)words;
+        uint64_t last = 0;
+        for (int i = 0; i < remaining; i++) {
+            last |= ((uint64_t)(tail[i])) << (8*i);
+        }
+        acc = acc*17 + last;
+    }
+    const uint64_t low_half = acc & 0xFFFFFFFF;
+    const uint64_t high_half = acc >> 32;
+    return ~(low_half ^ high_half);
+}
+
+// Optimized x1764 checksum of the first len bytes of vbuf.
+// While at least 32 bytes remain, four 64-bit words are consumed per step
+// into four separate accumulators, each multiplied by 17^4 per step.  The
+// accumulators are then combined with weights 17^3, 17^2, 17 and 1, any
+// remaining complete words are folded in one at a time, and 1..7 trailing
+// bytes are packed into a final word (first byte lowest).  The result is
+// the bitwise complement of the xor of the low and high 32-bit halves.
+uint32_t toku_x1764_memory (const void *vbuf, int len)
+{
+    const uint8_t *CAST_FROM_VOIDP(buf, vbuf);
+    const int block_bytes = 4*sizeof(uint64_t);
+    uint64_t lane_a = 0, lane_b = 0, lane_c = 0, lane_d = 0;
+    for (; len >= block_bytes; buf += block_bytes, len -= block_bytes) {
+        const uint64_t *quad = (uint64_t*)buf;
+        lane_a = lane_a*(17LL*17LL*17LL*17LL) + quad[0];
+        lane_b = lane_b*(17LL*17LL*17LL*17LL) + quad[1];
+        lane_c = lane_c*(17LL*17LL*17LL*17LL) + quad[2];
+        lane_d = lane_d*(17LL*17LL*17LL*17LL) + quad[3];
+    }
+    uint64_t sum = lane_a*17L*17L*17L + lane_b*17L*17L + lane_c*17L + lane_d;
+    assert(len>=0);
+    // Fewer than four words are left: fold in the complete ones singly.
+    while ((uint64_t)len>=sizeof(uint64_t)) {
+        const uint64_t word = *(uint64_t*)buf;
+        buf += sizeof(uint64_t);
+        len -= sizeof(uint64_t);
+        sum = sum*17 + word;
+    }
+    if (len>0) {
+        // Pack the trailing bytes into one last word.
+        uint64_t tailsum = 0;
+        int shift = 0;
+        for (int i=0; i<len; i++, shift += 8) {
+            tailsum |= ((uint64_t)(buf[i]))<<shift;
+        }
+        sum = sum*17 + tailsum;
+    }
+    const uint64_t folded = (sum&0xFFFFFFFF) ^ (sum>>32);
+    return ~folded;
+}
+
+
+// Reset an incremental checksum state: running sum 0 and no pending bytes.
+void toku_x1764_init(struct x1764 *l) {
+    l->n_input_bytes = 0;
+    l->input = 0;
+    l->sum = 0;
+}
+
+// Feed len more bytes from vbuf into the incremental checksum state *l.
+// State carried between calls:
+//   l->sum            running value over the complete 64-bit words so far
+//                     (updated as sum = sum*17 + word)
+//   l->input          bytes that do not yet form a complete word, first
+//                     byte in the lowest-order position
+//   l->n_input_bytes  number of bytes held in l->input (0..7 between calls)
+// Multi-byte pieces of vbuf are read directly as uint64_t / uint32_t /
+// uint16_t values.
+void toku_x1764_add (struct x1764 *l, const void *vbuf, int len) {
+    if (PRINT) printf("%d: n_input_bytes=%d len=%d\n", __LINE__, l->n_input_bytes, len);
+    int n_input_bytes = l->n_input_bytes;
+    const unsigned char *CAST_FROM_VOIDP(cbuf, vbuf);
+    // Special case short inputs
+    if (len==1) {
+        uint64_t input = l->input | ((uint64_t)(*cbuf))<<(8*n_input_bytes);
+        n_input_bytes++;
+        if (n_input_bytes==8) {
+            // The new byte completes a word.
+            l->sum = l->sum*17 + input;
+            l->n_input_bytes = 0;
+            l->input = 0;
+        } else {
+            l->input = input;
+            l->n_input_bytes = n_input_bytes;
+        }
+        return;
+    } else if (len==2) {
+        uint64_t input = l->input;
+        uint64_t thisv = ((uint64_t)(*(uint16_t*)cbuf));
+        if (n_input_bytes==7) {
+            // The low byte completes the word; the high byte starts the next.
+            l->sum = l->sum*17 + (input | (thisv<<(8*7)));
+            l->input = thisv>>8;
+            l->n_input_bytes = 1;
+        } else if (n_input_bytes==6) {
+            // Both bytes fit and exactly complete the word.
+            l->sum = l->sum*17 + (input | (thisv<<(8*6)));
+            l->input = 0;
+            l->n_input_bytes = 0;
+        } else {
+            l->input = input | (thisv<<(8*n_input_bytes));
+            l->n_input_bytes += 2;
+        }
+        return;
+    }
+
+    uint64_t sum;
+    //assert(len>=0);
+    if (n_input_bytes) {
+        // There are pending bytes, so every incoming word straddles two
+        // output words: its low part completes the pending word and its high
+        // part becomes the new pending bytes.
+        uint64_t input = l->input;
+        if (len>=8) {
+            sum = l->sum;
+            while (len>=8) {
+                uint64_t thisv = *(uint64_t*)cbuf;
+                input |= thisv<<(8*n_input_bytes);
+                sum = sum*17 + input;
+                if (PRINT) printf("%d: input=%016" PRIx64 " sum=%016" PRIx64 "\n", __LINE__, input, sum);
+                input = thisv>>(8*(8-n_input_bytes));
+                if (PRINT) printf("%d: input=%016" PRIx64 "\n", __LINE__, input);
+                len-=8;
+                cbuf+=8;
+                // n_input_bytes remains unchanged
+                if (PRINT) printf("%d: n_input_bytes=%d len=%d\n", __LINE__, l->n_input_bytes, len);
+            }
+            l->sum = sum;
+        }
+        if (len>=4) {
+            uint64_t thisv = *(uint32_t*)cbuf;
+            if (n_input_bytes<4) {
+                // Four more bytes still fit in the pending word.
+                input |= thisv<<(8*n_input_bytes);
+                if (PRINT) printf("%d: input=%016" PRIx64 "\n", __LINE__, input);
+                n_input_bytes+=4;
+            } else {
+                // Four more bytes complete the pending word; the rest carries over.
+                input |= thisv<<(8*n_input_bytes);
+                l->sum = l->sum*17 + input;
+                if (PRINT) printf("%d: input=%016" PRIx64 " sum=%016" PRIx64 "\n", __LINE__, input, l->sum);
+                input = thisv>>(8*(8-n_input_bytes));
+                n_input_bytes-=4;
+                if (PRINT) printf("%d: input=%016" PRIx64 " n_input_bytes=%d\n", __LINE__, input, n_input_bytes);
+            }
+            len-=4;
+            cbuf+=4;
+            if (PRINT) printf("%d: len=%d\n", __LINE__, len);
+        }
+        //assert(n_input_bytes<=8);
+        // Top up the pending word one byte at a time.
+        while (n_input_bytes<8 && len) {
+            input |= ((uint64_t)(*cbuf))<<(8*n_input_bytes);
+            n_input_bytes++;
+            cbuf++;
+            len--;
+        }
+        //assert(len>=0);
+        if (n_input_bytes<8) {
+            //assert(len==0);
+            // Input exhausted before the word was completed: save and stop.
+            l->input = input;
+            l->n_input_bytes = n_input_bytes;
+            if (PRINT) printf("%d: n_input_bytes=%d\n", __LINE__, l->n_input_bytes);
+            return;
+        }
+        sum = l->sum*17 + input;
+    } else {
+        //assert(len>=0);
+        sum = l->sum;
+    }
+    // From here on there are no pending bytes: consume complete words
+    // directly, then stash whatever is left (fewer than 8 bytes) as pending.
+    //assert(len>=0);
+    while (len>=8) {
+        sum = sum*17 + *(uint64_t*)cbuf;
+        cbuf+=8;
+        len -=8;
+    }
+    l->sum = sum;
+    n_input_bytes = 0;
+    uint64_t input;
+    l->n_input_bytes = len;
+    // Surprisingly, the loop is the fastest on bradley's laptop.
+    // Only the first branch below ever runs; the other two are alternative
+    // ways of gathering the same tail bytes and are disabled by the
+    // constant conditions.
+    if (1) {
+        int i;
+        input=0;
+        for (i=0; i<len; i++) {
+            input |= ((uint64_t)(cbuf[i]))<<(8*i);
+        }
+    } else if (0) {
+        switch (len) {
+        // Seven bytes: bytes 0-3, bytes 4-5, then byte 6 at bit 48.
+        case 7: input = ((uint64_t)(*(uint32_t*)(cbuf))) | (((uint64_t)(*(uint16_t*)(cbuf+4)))<<32) | (((uint64_t)(*(cbuf+6)))<<48); break;
+        case 6: input = ((uint64_t)(*(uint32_t*)(cbuf))) | (((uint64_t)(*(uint16_t*)(cbuf+4)))<<32); break;
+        case 5: input = ((uint64_t)(*(uint32_t*)(cbuf))) | (((uint64_t)(*(cbuf+4)))<<32); break;
+        case 4: input = ((uint64_t)(*(uint32_t*)(cbuf))); break;
+        case 3: input = ((uint64_t)(*(uint16_t*)(cbuf))) | (((uint64_t)(*(cbuf+2)))<<16); break;
+        case 2: input = ((uint64_t)(*(uint16_t*)(cbuf))); break;
+        case 1: input = ((uint64_t)(*cbuf)); break;
+        case 0: input = 0;                      break;
+        default: abort();
+        }
+    } else {
+        input=0;
+        int i=0;
+        if (len>=4) { input  = ((uint64_t)(*(uint32_t*)(cbuf)));        cbuf+=4; len-=4; i=4;}
+        if (len>=2) { input |= ((uint64_t)(*(uint16_t*)(cbuf)))<<(i*8); cbuf+=2; len-=2; i+=2; }
+        if (len>=1) { input |= ((uint64_t)(*(uint8_t *)(cbuf)))<<(i*8); /*cbuf+=1; len-=1; i++;*/ }
+    }
+    l->input = input;
+    if (PRINT) printf("%d: n_input_bytes=%d\n", __LINE__, l->n_input_bytes);
+}
+// Produce the 32-bit checksum from an incremental state.
+// If bytes are pending, they are folded into l->sum as one last word; the
+// result is the bitwise complement of the xor of the low and high 32-bit
+// halves of l->sum.
+// Note that l->sum is modified while n_input_bytes is left as it was, so a
+// second call on the same state would fold the pending bytes in again.
+uint32_t toku_x1764_finish (struct x1764 *l) {
+    if (PRINT) printf("%d: n_input_bytes=%d\n", __LINE__, l->n_input_bytes);
+    if (l->n_input_bytes > 0) {
+        l->sum = l->sum*17 + l->input;
+    }
+    const uint64_t total = l->sum;
+    return ~((total & 0xffffffff) ^ (total >> 32));
+}
diff --git a/storage/tokudb/PerconaFT/util/x1764.h b/storage/tokudb/PerconaFT/util/x1764.h
new file mode 100644
index 00000000..41302658
--- /dev/null
+++ b/storage/tokudb/PerconaFT/util/x1764.h
@@ -0,0 +1,70 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <toku_stdint.h>
+
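+// The x1764 hash is
+//   $s = \sum_{i=0}^{n-1} a_i \cdot 17^{\,n-1-i} \pmod{2^{64}}$
+// where $a_i$ is the $i$th 64-bit number of the input (represented in little-endian format) and $n$ is the number of such numbers;
+// equivalently, s starts at 0 and is updated as s = s*17 + a_i for each number in order.
+// The final 32-bit result is the bitwise complement of the xor of the high-order and low-order 32 bits of s.
+// If fewer than 8 bytes are left at the end, they form one last number with the bytes filled in from the low end (and zeros above them).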
+
+
+uint32_t toku_x1764_memory (const void *buf, int len);
+// Effect: Compute x1764 on the first len bytes of buf.  Return the 32 bit answer.
+
+uint32_t toku_x1764_memory_simple (const void *buf, int len);
+// Effect: Same as toku_x1764_memory, but not highly optimized (more likely to be correct).  Useful for testing the optimized version.
+
+
+// For incrementally computing an x1764, use the following interfaces.
+struct x1764 {
+    uint64_t sum;       // running value over the complete 64-bit numbers added so far
+    uint64_t input;     // bytes that do not yet form a complete 64-bit number, first byte at the low end
+    int n_input_bytes;  // how many bytes are currently held in input
+};
+
+void toku_x1764_init(struct x1764 *l);
+// Effect: Initialize *l (sum, input and n_input_bytes are all set to zero).
+
+void toku_x1764_add (struct x1764 *l, const void *vbuf, int len);
+// Effect: Add len more bytes, taken from vbuf, to *l.
+
+uint32_t toku_x1764_finish (struct x1764 *l);
+// Effect: Return the final 32-bit result.  Any bytes still held in l->input are folded into l->sum first, so *l is modified.
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 18:07:14 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 18:07:14 +0000
commit	a175314c3e5827eb193872241446f2f8f5c9d33c (patch)
tree	cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/tokudb/PerconaFT/util
parent	Initial commit. (diff)
download	mariadb-10.5-upstream/1%10.5.12.tar.xz mariadb-10.5-upstream/1%10.5.12.zip