path: root/src/common/cohort_lru.h
Diffstat (limited to 'src/common/cohort_lru.h')
-rw-r--r--  src/common/cohort_lru.h  501
1 files changed, 501 insertions, 0 deletions
diff --git a/src/common/cohort_lru.h b/src/common/cohort_lru.h
new file mode 100644
index 00000000..2383fc95
--- /dev/null
+++ b/src/common/cohort_lru.h
@@ -0,0 +1,501 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copyright (C) 2015 CohortFS, LLC.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef COHORT_LRU_H
+#define COHORT_LRU_H
+
+#include <algorithm>
+#include <atomic>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <mutex>
+#include <vector>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/slist.hpp>
+
+#include "common/likely.h"
+#include "include/ceph_assert.h"
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 64 /* XXX arch-specific define */
+#endif
+#define CACHE_PAD(_n) char __pad ## _n [CACHE_LINE_SIZE]
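+/* CACHE_PAD keeps each Lane/Partition on its own cache line, limiting
+ * false sharing between their locks */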
+
+namespace cohort {
+
+ namespace lru {
+
+ namespace bi = boost::intrusive;
+
+ /* public flag values */
+ constexpr uint32_t FLAG_NONE = 0x0000;
+ constexpr uint32_t FLAG_INITIAL = 0x0001;
+ constexpr uint32_t FLAG_RECYCLE = 0x0002;
+
+ enum class Edge : std::uint8_t
+ {
+ MRU = 0,
+ LRU
+ };
+
+ typedef bi::link_mode<bi::safe_link> link_mode;
+
+ class ObjectFactory; // Forward declaration
+
+ class Object
+ {
+ private:
+ uint32_t lru_flags;
+ std::atomic<uint32_t> lru_refcnt;
+ std::atomic<uint32_t> lru_adj;
+ bi::list_member_hook< link_mode > lru_hook;
+
+ typedef bi::list<Object,
+ bi::member_hook<
+ Object, bi::list_member_hook< link_mode >,
+ &Object::lru_hook >,
+ bi::constant_time_size<true>> Queue;
+
+ bi::slist_member_hook< link_mode > q2_hook;
+
+ typedef bi::slist<Object,
+ bi::member_hook<
+ Object, bi::slist_member_hook< link_mode >,
+ &Object::q2_hook >,
+ bi::constant_time_size<true>> Queue2;
+
+ public:
+
+ Object() : lru_flags(FLAG_NONE), lru_refcnt(0), lru_adj(0) {}
+
+ uint32_t get_refcnt() const { return lru_refcnt; }
+
+ virtual bool reclaim(const ObjectFactory* newobj_fac) = 0;
+
+ virtual ~Object() {}
+
+ private:
+ template <typename LK>
+ friend class LRU;
+
+ template <typename T, typename TTree, typename CLT, typename CEQ,
+ typename K, typename LK>
+ friend class TreeX;
+ };
+
+ /* allocator & recycler interface (create or re-use LRU objects) */
+ class ObjectFactory
+ {
+ public:
+ virtual Object* alloc(void) = 0;
+ virtual void recycle(Object*) = 0;
+ virtual ~ObjectFactory() {}
+ };
+
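+ /* Illustrative sketch only (hypothetical names, not part of this
+  * header): a minimal concrete Object/ObjectFactory pair.  A cache
+  * entry derives from Object and implements reclaim(); its factory
+  * allocates new entries and re-initializes recycled ones.
+  *
+  *   struct Thing : public cohort::lru::Object {
+  *     uint64_t key{0};
+  *     bool reclaim(const cohort::lru::ObjectFactory*) override {
+  *       return true; // nothing else pins us; safe to evict
+  *     }
+  *   };
+  *
+  *   struct ThingFactory : public cohort::lru::ObjectFactory {
+  *     uint64_t next_key{0};
+  *     cohort::lru::Object* alloc(void) override { return new Thing(); }
+  *     void recycle(cohort::lru::Object* o) override {
+  *       static_cast<Thing*>(o)->key = next_key++; // reset for re-use
+  *     }
+  *   };
+  */
+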
+ template <typename LK>
+ class LRU
+ {
+ private:
+
+ struct Lane {
+ LK lock;
+ Object::Queue q;
+ // Object::Queue pinned; /* placeholder for possible expansion */
+ CACHE_PAD(0);
+ Lane() {}
+ };
+
+ Lane *qlane;
+ int n_lanes;
+ std::atomic<uint32_t> evict_lane;
+ const uint32_t lane_hiwat;
+
+ static constexpr uint32_t lru_adj_modulus = 5;
+
+ static constexpr uint32_t SENTINEL_REFCNT = 1;
+
+ /* internal flag values */
+ static constexpr uint32_t FLAG_INLRU = 0x0001;
+ static constexpr uint32_t FLAG_PINNED = 0x0002; // possible future use
+ static constexpr uint32_t FLAG_EVICTING = 0x0004;
+
+ Lane& lane_of(void* addr) {
+ return qlane[(uint64_t)(addr) % n_lanes];
+ }
+
+ uint32_t next_evict_lane() {
+ return (evict_lane++ % n_lanes);
+ }
+
+ bool can_reclaim(Object* o) {
+ return ((o->lru_refcnt == SENTINEL_REFCNT) &&
+ (!(o->lru_flags & FLAG_EVICTING)));
+ }
+
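+ /* scan lanes round-robin, starting at the next eviction lane: in each
+  * lane, examine the object at the LRU end; if only the sentinel
+  * reference remains, mark it EVICTING and ask it to reclaim() itself.
+  * On success the object is unlinked and returned for re-use by the
+  * caller; otherwise the next lane is tried. */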
+ Object* evict_block(const ObjectFactory* newobj_fac) {
+ uint32_t lane_ix = next_evict_lane();
+ for (int ix = 0; ix < n_lanes; ++ix,
+ lane_ix = next_evict_lane()) {
+ Lane& lane = qlane[lane_ix];
+ lane.lock.lock();
+ /* skip empty lanes: back() on an empty queue is undefined */
+ if (unlikely(lane.q.empty())) { lane.lock.unlock(); continue; }
+ /* if object at LRU has refcnt==1, it may be reclaimable */
+ Object* o = &(lane.q.back());
+ if (can_reclaim(o)) {
+ ++(o->lru_refcnt);
+ o->lru_flags |= FLAG_EVICTING;
+ lane.lock.unlock();
+ if (o->reclaim(newobj_fac)) {
+ lane.lock.lock();
+ --(o->lru_refcnt);
+ /* assertions that o state has not changed across
+ * relock */
+ ceph_assert(o->lru_refcnt == SENTINEL_REFCNT);
+ ceph_assert(o->lru_flags & FLAG_INLRU);
+ Object::Queue::iterator it =
+ Object::Queue::s_iterator_to(*o);
+ lane.q.erase(it);
+ lane.lock.unlock();
+ return o;
+ } else {
+ // XXX can't make unreachable (means what?)
+ --(o->lru_refcnt);
+ o->lru_flags &= ~FLAG_EVICTING;
+ /* unlock in next block */
+ }
+ } /* can_reclaim(o) */
+ lane.lock.unlock();
+ } /* each lane */
+ return nullptr;
+ } /* evict_block */
+
+ public:
+
+ LRU(int lanes, uint32_t _hiwat)
+ : n_lanes(lanes), evict_lane(0), lane_hiwat(_hiwat)
+ {
+ ceph_assert(n_lanes > 0);
+ qlane = new Lane[n_lanes];
+ }
+
+ ~LRU() { delete[] qlane; }
+
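+ /* take a reference; on FLAG_INITIAL references, every
+  * lru_adj_modulus-th access also promotes the object to the MRU end
+  * of its lane, bounding how often the lane lock is taken */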
+ bool ref(Object* o, uint32_t flags) {
+ ++(o->lru_refcnt);
+ if (flags & FLAG_INITIAL) {
+ if ((++(o->lru_adj) % lru_adj_modulus) == 0) {
+ Lane& lane = lane_of(o);
+ lane.lock.lock();
+ /* move to MRU */
+ Object::Queue::iterator it =
+ Object::Queue::s_iterator_to(*o);
+ lane.q.erase(it);
+ lane.q.push_front(*o);
+ lane.lock.unlock();
+ } /* adj */
+ } /* initial ref */
+ return true;
+ } /* ref */
+
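+ /* drop a reference; at 0 the object is unlinked from its lane and
+  * deleted out of line, and at the sentinel count it is re-queued at
+  * the LRU end of its lane, or deleted if the lane exceeds lane_hiwat */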
+ void unref(Object* o, uint32_t flags) {
+ uint32_t refcnt = --(o->lru_refcnt);
+ Object* tdo = nullptr;
+ if (unlikely(refcnt == 0)) {
+ Lane& lane = lane_of(o);
+ lane.lock.lock();
+ refcnt = o->lru_refcnt.load();
+ if (unlikely(refcnt == 0)) {
+ Object::Queue::iterator it =
+ Object::Queue::s_iterator_to(*o);
+ lane.q.erase(it);
+ tdo = o;
+ }
+ lane.lock.unlock();
+ } else if (unlikely(refcnt == SENTINEL_REFCNT)) {
+ Lane& lane = lane_of(o);
+ lane.lock.lock();
+ refcnt = o->lru_refcnt.load();
+ if (likely(refcnt == SENTINEL_REFCNT)) {
+ /* move to LRU */
+ Object::Queue::iterator it =
+ Object::Queue::s_iterator_to(*o);
+ lane.q.erase(it);
+ /* hiwat check */
+ if (lane.q.size() > lane_hiwat) {
+ tdo = o;
+ } else {
+ lane.q.push_back(*o);
+ }
+ }
+ lane.lock.unlock();
+ }
+ /* unref out-of-line && !LOCKED */
+ if (tdo)
+ delete tdo;
+ } /* unref */
+
+ Object* insert(ObjectFactory* fac, Edge edge, uint32_t& flags) {
+ /* use supplied functor to re-use an evicted object, or
+ * allocate a new one of the descendant type */
+ Object* o = evict_block(fac);
+ if (o) {
+ fac->recycle(o); /* recycle existing object */
+ flags |= FLAG_RECYCLE;
+ }
+ else
+ o = fac->alloc(); /* get a new one */
+
+ o->lru_flags = FLAG_INLRU;
+
+ Lane& lane = lane_of(o);
+ lane.lock.lock();
+ switch (edge) {
+ case Edge::MRU:
+ lane.q.push_front(*o);
+ break;
+ case Edge::LRU:
+ lane.q.push_back(*o);
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+ if (flags & FLAG_INITIAL)
+ o->lru_refcnt += 2; /* sentinel ref + initial */
+ else
+ ++(o->lru_refcnt); /* sentinel */
+ lane.lock.unlock();
+ return o;
+ } /* insert */
+
+ };
+
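+ /* TreeX: a lookup table of T partitioned across n_part independently
+  * locked intrusive trees (TTree), each fronted by a small
+  * direct-mapped pointer cache indexed by the caller-supplied hash;
+  * CLT/CEQ are the key compare/equality functors and LK the lock type */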
+ template <typename T, typename TTree, typename CLT, typename CEQ,
+ typename K, typename LK>
+ class TreeX
+ {
+ public:
+
+ static constexpr uint32_t FLAG_NONE = 0x0000;
+ static constexpr uint32_t FLAG_LOCK = 0x0001;
+ static constexpr uint32_t FLAG_UNLOCK = 0x0002;
+ static constexpr uint32_t FLAG_UNLOCK_ON_MISS = 0x0004;
+
+ typedef T value_type;
+ typedef TTree container_type;
+ typedef typename TTree::iterator iterator;
+ typedef std::pair<iterator, bool> check_result;
+ typedef typename TTree::insert_commit_data insert_commit_data;
+ int n_part;
+ int csz;
+
+ typedef std::unique_lock<LK> unique_lock;
+
+ struct Partition {
+ LK lock;
+ TTree tr;
+ T** cache;
+ int csz;
+ CACHE_PAD(0);
+
+ Partition() : tr(), cache(nullptr), csz(0) {}
+
+ ~Partition() {
+ if (csz)
+ ::operator delete(cache);
+ }
+ };
+
+ struct Latch {
+ Partition* p;
+ LK* lock;
+ insert_commit_data commit_data{};
+
+ Latch() : p(nullptr), lock(nullptr) {}
+ };
+
+ Partition& partition_of_scalar(uint64_t x) {
+ return part[x % n_part];
+ }
+
+ Partition& get(uint8_t x) {
+ return part[x];
+ }
+
+ Partition*& get() {
+ return part;
+ }
+
+ void lock() {
+ std::for_each(locks.begin(), locks.end(),
+ [](LK* lk){ lk->lock(); });
+ }
+
+ void unlock() {
+ std::for_each(locks.begin(), locks.end(),
+ [](LK* lk){ lk->unlock(); });
+ }
+
+ TreeX(int n_part=1, int csz=127) : n_part(n_part), csz(csz) {
+ ceph_assert(n_part > 0);
+ part = new Partition[n_part];
+ for (int ix = 0; ix < n_part; ++ix) {
+ Partition& p = part[ix];
+ if (csz) {
+ p.csz = csz;
+ p.cache = (T**) ::operator new(csz * sizeof(T*));
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(p.cache, 0, csz * sizeof(T*));
+ }
+ locks.push_back(&p.lock);
+ }
+ }
+
+ ~TreeX() {
+ delete[] part;
+ }
+
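+ /* look up k in the partition selected by hk, consulting the
+  * partition's cache slot before searching the tree; FLAG_LOCK takes
+  * and releases the partition lock around the lookup */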
+ T* find(uint64_t hk, const K& k, uint32_t flags) {
+ T* v;
+ Latch lat;
+ uint32_t slot = 0;
+ lat.p = &(partition_of_scalar(hk));
+ if (flags & FLAG_LOCK) {
+ lat.lock = &lat.p->lock;
+ lat.lock->lock();
+ }
+ if (csz) { /* template specialize? */
+ slot = hk % csz;
+ v = lat.p->cache[slot];
+ if (v) {
+ if (CEQ()(*v, k)) {
+ if (flags & FLAG_LOCK)
+ lat.lock->unlock();
+ return v;
+ }
+ v = nullptr;
+ }
+ } else {
+ v = nullptr;
+ }
+ iterator it = lat.p->tr.find(k, CLT());
+ if (it != lat.p->tr.end()){
+ v = &(*(it));
+ if (csz) {
+ /* fill cache slot at hk */
+ lat.p->cache[slot] = v;
+ }
+ }
+ if (flags & FLAG_LOCK)
+ lat.lock->unlock();
+ return v;
+ } /* find */
+
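+ /* like find(), but also primes lat with the partition, its lock, and
+  * insert_unique_check() commit data, so a miss can be completed with
+  * insert_latched() while the partition lock is still held */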
+ T* find_latch(uint64_t hk, const K& k, Latch& lat,
+ uint32_t flags) {
+ uint32_t slot = 0;
+ T* v;
+ lat.p = &(partition_of_scalar(hk));
+ lat.lock = &lat.p->lock;
+ if (flags & FLAG_LOCK)
+ lat.lock->lock();
+ if (csz) { /* template specialize? */
+ slot = hk % csz;
+ v = lat.p->cache[slot];
+ if (v) {
+ if (CEQ()(*v, k)) {
+ if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK))
+ lat.lock->unlock();
+ return v;
+ }
+ v = nullptr;
+ }
+ } else {
+ v = nullptr;
+ }
+ check_result r = lat.p->tr.insert_unique_check(
+ k, CLT(), lat.commit_data);
+ if (! r.second /* !insertable, i.e., key already present */) {
+ v = &(*(r.first));
+ if (csz) {
+ /* fill cache slot at hk */
+ lat.p->cache[slot] = v;
+ }
+ }
+ if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK))
+ lat.lock->unlock();
+ return v;
+ } /* find_latch */
+ bool is_same_partition(uint64_t lhs, uint64_t rhs) {
+ return ((lhs % n_part) == (rhs % n_part));
+ }
+ void insert_latched(T* v, Latch& lat, uint32_t flags) {
+ (void) lat.p->tr.insert_unique_commit(*v, lat.commit_data);
+ if (flags & FLAG_UNLOCK)
+ lat.lock->unlock();
+ } /* insert_latched */
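+
+ /* Illustrative lookup-or-insert flow (hypothetical caller code; `tree'
+  * is a TreeX instance and `hk' is the hash of `key'):
+  *
+  *   Latch lat;
+  *   T* v = tree.find_latch(hk, key, lat, FLAG_LOCK);
+  *   if (!v) {
+  *     v = make_new_T(key);              // e.g., via an LRU insert()
+  *     tree.insert_latched(v, lat, FLAG_UNLOCK);
+  *   } else {
+  *     lat.lock->unlock();               // hit: release the partition
+  *   }
+  */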
+
+ void insert(uint64_t hk, T* v, uint32_t flags) {
+ Partition& p = partition_of_scalar(hk);
+ if (flags & FLAG_LOCK)
+ p.lock.lock();
+ p.tr.insert_unique(*v);
+ if (flags & FLAG_LOCK)
+ p.lock.unlock();
+ } /* insert */
+
+ void remove(uint64_t hk, T* v, uint32_t flags) {
+ Partition& p = partition_of_scalar(hk);
+ iterator it = TTree::s_iterator_to(*v);
+ if (flags & FLAG_LOCK)
+ p.lock.lock();
+ p.tr.erase(it);
+ if (csz) { /* template specialize? */
+ uint32_t slot = hk % csz;
+ T* v2 = p.cache[slot];
+ /* we are intrusive, just compare addresses */
+ if (v == v2)
+ p.cache[slot] = nullptr;
+ }
+ if (flags & FLAG_LOCK)
+ p.lock.unlock();
+ } /* remove */
+
+ void drain(std::function<void(T*)> uref,
+ uint32_t flags = FLAG_NONE) {
+ /* clear the table, calling the supplied function on each element
+  * found (e.g., so the caller can return its sentinel reference) */
+ Object::Queue2 drain_q;
+ for (int t_ix = 0; t_ix < n_part; ++t_ix) {
+ Partition& p = part[t_ix];
+ if (flags & FLAG_LOCK) /* LOCKED */
+ p.lock.lock();
+ while (p.tr.size() > 0) {
+ iterator it = p.tr.begin();
+ T* v = &(*it);
+ p.tr.erase(it);
+ drain_q.push_front(*v);
+ }
+ if (flags & FLAG_LOCK) /* we locked it, !LOCKED */
+ p.lock.unlock();
+ } /* each partition */
+ /* unref out-of-line && !LOCKED */
+ while (drain_q.size() > 0) {
+ Object::Queue2::iterator it = drain_q.begin();
+ T* v = static_cast<T*>(&(*it));
+ drain_q.erase(it); /* must precede uref(v) in safe_link mode */
+ uref(v);
+ }
+ } /* drain */
+
+ private:
+ Partition *part;
+ std::vector<LK*> locks;
+ };
+
+ } /* namespace lru */
+} /* namespace cohort */
+
+#endif /* COHORT_LRU_H */