1 files changed, 607 insertions, 0 deletions
diff --git a/storage/tokudb/PerconaFT/ft/cachetable/cachetable-internal.h b/storage/tokudb/PerconaFT/ft/cachetable/cachetable-internal.h
new file mode 100644
index 00000000..05fb771d
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/cachetable/cachetable-internal.h
@@ -0,0 +1,607 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "cachetable/background_job_manager.h"
+#include <portability/toku_random.h>
+#include <util/frwlock.h>
+#include <util/kibbutz.h>
+#include <util/nb_mutex.h>
+#include <util/partitioned_counter.h>
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// This file contains the classes and structs that make up the cachetable.
+// The structs are:
+//  - cachefile
+//  - ctpair
+//  - pair_list
+//  - cachefile_list
+//  - checkpointer
+//  - evictor
+//  - cleaner
+//
+// The rest of this comment assumes familiarity with the locks used in these
+// classes/structs and what the locks protect. Nevertheless, here is 
+// a list of the locks that we have:
+//  - pair_list->list_lock
+//  - pair_list->pending_lock_expensive
+//  - pair_list->pending_lock_cheap
+//  - cachefile_list->lock
+//  - PAIR->mutex
+//  - PAIR->value_rwlock
+//  - PAIR->disk_nb_mutex
+//
+// Here are rules for how the locks interact:
+//  - To grab any of the pair_list's locks, or the cachefile_list's lock,
+//      the cachetable must be in existence
+//  - To grab the PAIR mutex, we must know the PAIR will not dissappear:
+//   - the PAIR must be pinned (value_rwlock or disk_nb_mutex is held)
+//   - OR, the pair_list's list lock is held
+//  - As a result, to get rid of a PAIR from the pair_list, we must hold
+//     both the pair_list's list_lock and the PAIR's mutex
+//  - To grab PAIR->value_rwlock, we must hold the PAIR's mutex
+//  - To grab PAIR->disk_nb_mutex, we must hold the PAIR's mutex
+//      and hold PAIR->value_rwlock
+//
+// Now let's talk about ordering. Here is an order from outer to inner (top locks must be grabbed first)
+//  - pair_list->pending_lock_expensive
+//  - pair_list->list_lock
+//  - cachefile_list->lock
+//  - PAIR->mutex
+//  - pair_list->pending_lock_cheap <-- after grabbing this lock, 
+//                                      NO other locks 
+//                                      should be grabbed.
+//  - when grabbing PAIR->value_rwlock or PAIR->disk_nb_mutex,
+//     if the acquisition will not block, then it does not matter if any other locks held,
+//     BUT if the acquisition will block, then NO other locks may be held besides
+//     PAIR->mutex.
+// 
+// HERE ARE TWO EXAMPLES:
+// To pin a PAIR on a client thread, the following must be done:
+//  - first grab the list lock and find the PAIR
+//  - with the list lock grabbed, grab PAIR->mutex
+//  - with PAIR->mutex held:
+//   - release list lock
+//   - pin PAIR
+//   - with PAIR pinned, grab pending_lock_cheap,
+//   - copy and clear PAIR->checkpoint_pending,
+//   - resolve checkpointing if necessary
+//   - return to user.
+//  The list lock may be held while pinning the PAIR if 
+//  the PAIR has no contention. Otherwise, we may have
+//  get a deadlock with another thread that has the PAIR pinned,
+//  tries to pin some other PAIR, and in doing so, grabs the list lock.
+//
+// To unpin a PAIR on a client thread:
+//  - because the PAIR is pinned, we don't need the pair_list's list_lock
+//  - so, simply acquire PAIR->mutex
+//  - unpin the PAIR
+//  - return
+//
+//////////////////////////////////////////////////////////////////////////////
+class evictor;
+class pair_list;
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Maps to a file on disk.
+//
+struct cachefile {
+    // these next two fields are protected by cachetable's list lock
+    // they are managed whenever we add or remove a pair from
+    // the cachetable. As of Riddler, this linked list is only used to
+    // make cachetable_flush_cachefile more efficient
+    PAIR cf_head; // doubly linked list that is NOT circular
+    uint32_t num_pairs; // count on number of pairs in the cachetable belong to this cachefile
+
+    bool for_checkpoint; //True if part of the in-progress checkpoint
+
+    // If set and the cachefile closes, the file will be removed.
+    // Clients must not operate on the cachefile after setting this,
+    // nor attempt to open any cachefile with the same fname (dname)
+    // until this cachefile has been fully closed and unlinked.
+    bool unlink_on_close;
+    // If set then fclose will not be logged in recovery log.
+    bool skip_log_recover_on_close;
+    int fd;       /* Bug: If a file is opened read-only, then it is stuck in read-only.  If it is opened read-write, then subsequent writers can write to it too. */
+    CACHETABLE cachetable;
+    struct fileid fileid;
+    // the filenum is used as an identifer of the cachefile
+    // for logging and recovery
+    FILENUM filenum;
+    // number used to generate hashes for blocks in the cachefile
+    // used in toku_cachetable_hash
+    // this used to be the filenum.fileid, but now it is separate
+    uint32_t hash_id;
+    char *fname_in_env; /* Used for logging */
+
+    void *userdata;
+    void (*log_fassociate_during_checkpoint)(CACHEFILE cf, void *userdata); // When starting a checkpoint we must log all open files.
+    void (*close_userdata)(CACHEFILE cf, int fd, void *userdata, bool lsnvalid, LSN); // when closing the last reference to a cachefile, first call this function. 
+    void (*free_userdata)(CACHEFILE cf, void *userdata); // when closing the last reference to a cachefile, first call this function. 
+    void (*begin_checkpoint_userdata)(LSN lsn_of_checkpoint, void *userdata); // before checkpointing cachefiles call this function.
+    void (*checkpoint_userdata)(CACHEFILE cf, int fd, void *userdata); // when checkpointing a cachefile, call this function.
+    void (*end_checkpoint_userdata)(CACHEFILE cf, int fd, void *userdata); // after checkpointing cachefiles call this function.
+    void (*note_pin_by_checkpoint)(CACHEFILE cf, void *userdata); // add a reference to the userdata to prevent it from being removed from memory
+    void (*note_unpin_by_checkpoint)(CACHEFILE cf, void *userdata); // add a reference to the userdata to prevent it from being removed from memory
+    BACKGROUND_JOB_MANAGER bjm;
+};
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//  The pair represents the data stored in the cachetable.
+//
+struct ctpair {
+    // these fields are essentially constants. They do not change.
+    CACHEFILE cachefile;
+    CACHEKEY key;
+    uint32_t fullhash;
+    CACHETABLE_FLUSH_CALLBACK flush_callback;
+    CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK pe_est_callback;
+    CACHETABLE_PARTIAL_EVICTION_CALLBACK pe_callback;
+    CACHETABLE_CLEANER_CALLBACK cleaner_callback;
+    CACHETABLE_CLONE_CALLBACK clone_callback;
+    CACHETABLE_CHECKPOINT_COMPLETE_CALLBACK checkpoint_complete_callback;
+    void *write_extraargs;
+
+    // access to these fields are protected by disk_nb_mutex
+    void* cloned_value_data; // cloned copy of value_data used for checkpointing
+    long cloned_value_size; // size of cloned_value_data, used for accounting of size_current
+    void* disk_data; // data used to fetch/flush value_data to and from disk.
+
+    // access to these fields are protected by value_rwlock
+    void* value_data; // data used by client threads, FTNODEs and ROLLBACK_LOG_NODEs
+    PAIR_ATTR attr;
+    enum cachetable_dirty dirty;
+
+    // protected by PAIR->mutex
+    uint32_t count;        // clock count
+    uint32_t refcount; // if > 0, then this PAIR is referenced by
+                       // callers to the cachetable, and therefore cannot 
+                       // be evicted
+    uint32_t num_waiting_on_refs; // number of threads waiting on refcount to go to zero
+    toku_cond_t refcount_wait; // cond used to wait for refcount to go to zero
+
+    // locks
+    toku::frwlock value_rwlock;
+    struct nb_mutex disk_nb_mutex; // single writer, protects disk_data, is used for writing cloned nodes for checkpoint
+    toku_mutex_t* mutex; // gotten from the pair list
+
+    // Access to checkpoint_pending is protected by two mechanisms,
+    // the value_rwlock and the pair_list's pending locks (expensive and cheap).
+    // checkpoint_pending may be true of false. 
+    // Here are the rules for reading/modifying this bit.
+    //  - To transition this field from false to true during begin_checkpoint,
+    //   we must be holding both of the pair_list's pending locks.
+    //  - To transition this field from true to false during end_checkpoint,
+    //   we must be holding the value_rwlock.
+    //  - For a non-checkpoint thread to read the value, we must hold both the
+    //   value_rwlock and one of the pair_list's pending locks
+    //  - For the checkpoint thread to read the value, we must 
+    //   hold the value_rwlock
+    //
+    bool checkpoint_pending; // If this is on, then we have got to resolve checkpointing modifying it.
+
+    // these are variables that are only used to transfer information to background threads
+    // we cache them here to avoid a malloc. In the future, we should investigate if this
+    // is necessary, as having these fields here is not technically necessary
+    long size_evicting_estimate;
+    evictor* ev;
+    pair_list* list;
+
+    // A PAIR is stored in a pair_list (which happens to be PAIR->list).
+    // These variables are protected by the list lock in the pair_list
+    //
+    // clock_next,clock_prev represent a circular doubly-linked list.
+    PAIR clock_next,clock_prev; // In clock.
+    PAIR hash_chain;
+
+    // pending_next,pending_next represent a non-circular doubly-linked list.
+    PAIR pending_next;
+    PAIR pending_prev;
+
+    // cf_next, cf_prev represent a non-circular doubly-linked list.
+    // entries in linked list for PAIRs in a cachefile, these are protected
+    // by the list lock of the PAIR's pair_list. They are used to make
+    // cachetable_flush_cachefile cheaper so that we don't need
+    // to search the entire cachetable to find a particular cachefile's
+    // PAIRs
+    PAIR cf_next;
+    PAIR cf_prev;
+};
+
+//
+// This initializes the fields and members of the pair.
+//
+void pair_init(PAIR p,
+    CACHEFILE cachefile,
+    CACHEKEY key,
+    void *value,
+    PAIR_ATTR attr,
+    enum cachetable_dirty dirty,
+    uint32_t fullhash,
+    CACHETABLE_WRITE_CALLBACK write_callback,
+    evictor *ev,
+    pair_list *list);
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//  The pair list maintains the set of PAIR's that make up
+//  the cachetable.
+//
+class pair_list {
+public:
+    //
+    // the following fields are protected by the list lock
+    // 
+    uint32_t m_n_in_table; // number of pairs in the hash table
+    uint32_t m_table_size; // number of buckets in the hash table
+    uint32_t m_num_locks;
+    PAIR *m_table; // hash table
+    toku_mutex_aligned_t *m_mutexes; 
+    // 
+    // The following fields are the heads of various linked lists.
+    // They also protected by the list lock, but their 
+    // usage is not as straightforward. For each of them,
+    // only ONE thread is allowed iterate over them with 
+    // a read lock on the list lock. All other threads
+    // that want to modify elements in the lists or iterate over
+    // the lists must hold the write list lock. Here is the
+    // association between what threads may hold a read lock
+    // on the list lock while iterating:
+    //  - clock_head -> eviction thread (evictor)
+    //  - cleaner_head -> cleaner thread (cleaner)
+    //  - pending_head -> checkpoint thread (checkpointer)
+    //
+    PAIR m_clock_head; // of clock . head is the next thing to be up for decrement. 
+    PAIR m_cleaner_head; // for cleaner thread. head is the next thing to look at for possible cleaning.
+    PAIR m_checkpoint_head; // for begin checkpoint to iterate over PAIRs and mark as pending_checkpoint
+    PAIR m_pending_head; // list of pairs marked with checkpoint_pending
+
+    // this field is public so we are still POD
+
+    // usage of this lock is described above
+    toku_pthread_rwlock_t m_list_lock;
+    //
+    // these locks are the "pending locks" referenced 
+    // in comments about PAIR->checkpoint_pending. There
+    // are two of them, but both serve the same purpose, which
+    // is to protect the transition of a PAIR's checkpoint pending
+    // value from false to true during begin_checkpoint.
+    // We use two locks, because threads that want to read the
+    // checkpoint_pending value may hold a lock for varying periods of time.
+    // Threads running eviction may need to protect checkpoint_pending
+    // while writing a node to disk, which is an expensive operation,
+    // so it uses pending_lock_expensive. Client threads that
+    // want to pin PAIRs will want to protect checkpoint_pending
+    // just long enough to read the value and wipe it out. This is
+    // a cheap operation, and as a result, uses pending_lock_cheap.
+    //
+    // By having two locks, and making begin_checkpoint first 
+    // grab pending_lock_expensive and then pending_lock_cheap,
+    // we ensure that threads that want to pin nodes can grab
+    // only pending_lock_cheap, and never block behind threads
+    // holding pending_lock_expensive and writing a node out to disk
+    //
+    toku_pthread_rwlock_t m_pending_lock_expensive;
+    toku_pthread_rwlock_t m_pending_lock_cheap;
+    void init();
+    void destroy();
+    void evict_completely(PAIR pair);
+    void evict_from_cachetable(PAIR pair);
+    void evict_from_cachefile(PAIR pair);
+    void add_to_cachetable_only(PAIR p);
+    void put(PAIR pair);
+    PAIR find_pair(CACHEFILE file, CACHEKEY key, uint32_t hash);
+    void pending_pairs_remove (PAIR p);
+    void verify();
+    void get_state(int *num_entries, int *hash_size);
+    void read_list_lock();
+    void read_list_unlock();
+    void write_list_lock();
+    void write_list_unlock();
+    void read_pending_exp_lock();
+    void read_pending_exp_unlock();
+    void write_pending_exp_lock();
+    void write_pending_exp_unlock();
+    void read_pending_cheap_lock();
+    void read_pending_cheap_unlock();
+    void write_pending_cheap_lock();
+    void write_pending_cheap_unlock();
+    toku_mutex_t* get_mutex_for_pair(uint32_t fullhash);
+    void pair_lock_by_fullhash(uint32_t fullhash);
+    void pair_unlock_by_fullhash(uint32_t fullhash);
+
+private:
+    void pair_remove (PAIR p);
+    void remove_from_hash_chain(PAIR p);
+    void add_to_cf_list (PAIR p);
+    void add_to_clock (PAIR p);
+    void add_to_hash_chain(PAIR p);
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Wrapper for the head of our cachefile list.
+//
+class cachefile_list {
+public:
+    void init();
+    void destroy();
+    void read_lock();
+    void read_unlock();
+    void write_lock();
+    void write_unlock();
+    int cachefile_of_iname_in_env(const char *iname_in_env, CACHEFILE *cf);
+    int cachefile_of_filenum(FILENUM filenum, CACHEFILE *cf);
+    void add_cf_unlocked(CACHEFILE newcf);
+    void add_stale_cf(CACHEFILE newcf);
+    void remove_cf(CACHEFILE cf);
+    void remove_stale_cf_unlocked(CACHEFILE cf);
+    FILENUM reserve_filenum();
+    uint32_t get_new_hash_id_unlocked();
+    CACHEFILE find_cachefile_unlocked(struct fileid* fileid);
+    CACHEFILE find_stale_cachefile_unlocked(struct fileid* fileid);
+    void verify_unused_filenum(FILENUM filenum);
+    bool evict_some_stale_pair(evictor* ev);
+    void free_stale_data(evictor* ev);
+    // access to these fields are protected by the lock
+    FILENUM m_next_filenum_to_use;
+    uint32_t m_next_hash_id_to_use;
+    toku_pthread_rwlock_t m_lock; // this field is publoc so we are still POD
+    toku::omt<CACHEFILE> m_active_filenum;
+    toku::omt<CACHEFILE> m_active_fileid;
+    toku::omt<CACHEFILE> m_stale_fileid;
+private:    
+    CACHEFILE find_cachefile_in_list_unlocked(CACHEFILE start, struct fileid* fileid);
+};
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//  The checkpointer handles starting and finishing checkpoints of the 
+//  cachetable's data.
+//
+class checkpointer {
+public:
+    int init(pair_list *_pl, TOKULOGGER _logger, evictor *_ev, cachefile_list *files);
+    void destroy();
+    void set_checkpoint_period(uint32_t new_period);
+    uint32_t get_checkpoint_period();
+    int shutdown();
+    bool has_been_shutdown();
+    void begin_checkpoint();
+    void add_background_job();
+    void remove_background_job();
+    void end_checkpoint(void (*testcallback_f)(void*),  void* testextra);
+    TOKULOGGER get_logger();
+    // used during begin_checkpoint
+    void increment_num_txns();
+private:
+    uint32_t m_checkpoint_num_txns;   // how many transactions are in the checkpoint
+    TOKULOGGER m_logger;
+    LSN m_lsn_of_checkpoint_in_progress;
+    uint32_t m_checkpoint_num_files; // how many cachefiles are in the checkpoint
+    struct minicron m_checkpointer_cron; // the periodic checkpointing thread
+    cachefile_list *m_cf_list;
+    pair_list *m_list;
+    evictor *m_ev;
+    bool m_checkpointer_cron_init;
+    bool m_checkpointer_init;
+    
+    // variable used by the checkpoint thread to know
+    // when all work induced by cloning on client threads is done
+    BACKGROUND_JOB_MANAGER m_checkpoint_clones_bjm;
+    // private methods for begin_checkpoint    
+    void update_cachefiles();
+    void log_begin_checkpoint();
+    void turn_on_pending_bits();
+    // private methods for end_checkpoint    
+    void fill_checkpoint_cfs(CACHEFILE* checkpoint_cfs);
+    void checkpoint_pending_pairs();
+    void checkpoint_userdata(CACHEFILE* checkpoint_cfs);
+    void log_end_checkpoint();
+    void end_checkpoint_userdata(CACHEFILE* checkpoint_cfs);
+    void remove_cachefiles(CACHEFILE* checkpoint_cfs);
+    
+    // Unit test struct needs access to private members.
+    friend struct checkpointer_test;
+};
+
+//
+// This is how often we want the eviction thread
+// to run, in seconds.
+//
+const int EVICTION_PERIOD = 1;
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// The evictor handles the removal of pairs from the pair list/cachetable.
+//
+class evictor {
+public:
+    int init(long _size_limit, pair_list* _pl, cachefile_list* _cf_list, KIBBUTZ _kibbutz, uint32_t eviction_period);
+    void destroy();
+    void add_pair_attr(PAIR_ATTR attr);
+    void remove_pair_attr(PAIR_ATTR attr);    
+    void change_pair_attr(PAIR_ATTR old_attr, PAIR_ATTR new_attr);
+    void add_cloned_data_size(long size);
+    void remove_cloned_data_size(long size);
+    uint64_t reserve_memory(double fraction, uint64_t upper_bound);
+    void release_reserved_memory(uint64_t reserved_memory);
+    void run_eviction_thread();
+    void do_partial_eviction(PAIR p);
+    void evict_pair(PAIR p, bool checkpoint_pending);
+    void wait_for_cache_pressure_to_subside();
+    void signal_eviction_thread();
+    void signal_eviction_thread_locked();
+    bool should_client_thread_sleep();
+    bool should_client_wake_eviction_thread();
+    // function needed for testing
+    void get_state(long *size_current_ptr, long *size_limit_ptr);
+    void fill_engine_status();
+    void set_enable_partial_eviction(bool enabled);
+    bool get_enable_partial_eviction(void) const;
+private:
+    void add_to_size_current(long size);
+    void remove_from_size_current(long size);
+    void run_eviction();
+    bool run_eviction_on_pair(PAIR p);
+    void try_evict_pair(PAIR p);
+    void decrease_size_evicting(long size_evicting_estimate);
+    bool should_sleeping_clients_wakeup();
+    bool eviction_needed();
+
+    // We have some intentional races with these variables because we're ok with reading something a little bit old.
+    // Provide some hooks for reading variables in an unsafe way so that there are function names we can stick in a valgrind suppression.
+    int64_t unsafe_read_size_current(void) const;
+    int64_t unsafe_read_size_evicting(void) const;
+
+    pair_list* m_pl;
+    cachefile_list* m_cf_list;
+    int64_t m_size_current;            // the sum of the sizes of the pairs in the cachetable
+    int64_t m_size_cloned_data; // stores amount of cloned data we have, only used for engine status
+    // changes to these two values are protected
+    // by ev_thread_lock
+    int64_t m_size_reserved;           // How much memory is reserved (e.g., by the loader)
+    int64_t m_size_evicting;           // the sum of the sizes of the pairs being written
+
+    // these are constants
+    int64_t m_low_size_watermark; // target max size of cachetable that eviction thread aims for
+    int64_t m_low_size_hysteresis; // if cachetable grows to this size, client threads wake up eviction thread upon adding data
+    int64_t m_high_size_watermark; // if cachetable grows to this size, client threads sleep upon adding data
+    int64_t m_high_size_hysteresis; // if > cachetable size, then sleeping client threads may wake up
+
+    bool m_enable_partial_eviction; // true if partial evictions are permitted
+
+    // used to calculate random numbers
+    struct random_data m_random_data;
+    char m_random_statebuf[64];
+
+    // mutex that protects fields listed immedietly below
+    toku_mutex_t m_ev_thread_lock;
+    // the eviction thread
+    toku_pthread_t m_ev_thread;
+    // condition variable that controls the sleeping period
+    // of the eviction thread
+    toku_cond_t m_ev_thread_cond;
+    // number of client threads that are currently sleeping
+    // due to an over-subscribed cachetable
+    uint32_t m_num_sleepers;
+    // states if the eviction thread should run. set to true
+    // in init, set to false during destroy
+    bool m_run_thread;
+    // bool that states if the eviction thread is currently running
+    bool m_ev_thread_is_running;
+    // period which the eviction thread sleeps
+    uint32_t m_period_in_seconds;
+    // condition variable on which client threads wait on when sleeping
+    // due to an over-subscribed cachetable
+    toku_cond_t m_flow_control_cond;
+
+    // variables for engine status
+    PARTITIONED_COUNTER m_size_nonleaf;
+    PARTITIONED_COUNTER m_size_leaf;
+    PARTITIONED_COUNTER m_size_rollback;
+    PARTITIONED_COUNTER m_size_cachepressure;
+    PARTITIONED_COUNTER m_wait_pressure_count;
+    PARTITIONED_COUNTER m_wait_pressure_time;
+    PARTITIONED_COUNTER m_long_wait_pressure_count;
+    PARTITIONED_COUNTER m_long_wait_pressure_time;
+    
+    KIBBUTZ m_kibbutz;
+
+    // this variable is ONLY used for testing purposes
+    uint64_t m_num_eviction_thread_runs;
+
+    bool m_ev_thread_init;
+    bool m_evictor_init;
+
+    friend class evictor_test_helpers;
+    friend class evictor_unit_test;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Iterates over the clean head in the pair list, calling the cleaner
+// callback on each node in that list.
+//
+class cleaner {
+public:
+    int init(uint32_t cleaner_iterations, pair_list* _pl, CACHETABLE _ct);
+    void destroy(void);
+    uint32_t get_iterations(void);
+    void set_iterations(uint32_t new_iterations);
+    uint32_t get_period_unlocked(void);
+    void set_period(uint32_t new_period);
+    int run_cleaner(void);
+    
+private:
+    pair_list* m_pl;
+    CACHETABLE m_ct;
+    struct minicron m_cleaner_cron; // the periodic cleaner thread
+    uint32_t m_cleaner_iterations; // how many times to run the cleaner per
+                                  // cleaner period (minicron has a
+                                  // minimum period of 1s so if you want
+                                  // more frequent cleaner runs you must
+                                  // use this)
+    bool m_cleaner_cron_init;
+    bool m_cleaner_init;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// The cachetable is as close to an ENV as we get.
+//
+struct cachetable {
+    pair_list list;
+    cleaner cl;
+    evictor ev;
+    checkpointer cp;
+    cachefile_list cf_list;
+    
+    KIBBUTZ client_kibbutz; // pool of worker threads and jobs to do asynchronously for the client.
+    KIBBUTZ ct_kibbutz; // pool of worker threads and jobs to do asynchronously for the cachetable
+    KIBBUTZ checkpointing_kibbutz; // small pool for checkpointing cloned pairs
+
+    char *env_dir;
+};