diff options
Diffstat (limited to 'src/common/FastCDC.cc')
-rw-r--r-- | src/common/FastCDC.cc | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/src/common/FastCDC.cc b/src/common/FastCDC.cc new file mode 100644 index 000000000..941fc873c --- /dev/null +++ b/src/common/FastCDC.cc @@ -0,0 +1,176 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <random> + +#include "FastCDC.h" + + +// Unlike FastCDC described in the paper, if we are close to the +// target, use the target mask. If we are very small or very large, +// use an adjusted mask--like the paper. This tries to keep more +// cut points using the same mask, and fewer using the small or large +// masks. + +// How many more/fewer bits to set in the small/large masks. +// +// This is the "normalization level" or "NC level" in the FastCDC +// paper. +#define TARGET_WINDOW_MASK_BITS 2 + +// How big the 'target window' is (in which we use the target mask). +// +// In the FastCDC paper, this is always 0: there is not "target +// window," and either small_mask (maskS) or large_mask (maskL) is +// used--never target_mask (maskA). +#define TARGET_WINDOW_BITS 1 + +// How many bits larger/smaller than target for hard limits on chunk +// size. +// +// We assume the min and max sizes are always this many bits +// larger/smaller than the target. (Note that the FastCDC paper 8KB +// example has a min of 2KB (2 bits smaller) and max of 64 KB (3 bits +// larger), although it is not clear why they chose those values.) +#define SIZE_WINDOW_BITS 2 + +void FastCDC::_setup(int target, int size_window_bits) +{ + target_bits = target; + + if (!size_window_bits) { + size_window_bits = SIZE_WINDOW_BITS; + } + min_bits = target - size_window_bits; + max_bits = target + size_window_bits; + + std::mt19937_64 engine; + + // prefill table + for (unsigned i = 0; i < 256; ++i) { + table[i] = engine(); + } + + // set mask + int did = 0; + uint64_t m = 0; + while (did < target_bits + TARGET_WINDOW_MASK_BITS) { + uint64_t bit = 1ull << (engine() & 63); + if (m & bit) { + continue; // this bit is already set + } + m |= bit; + ++did; + if (did == target_bits - TARGET_WINDOW_MASK_BITS) { + large_mask = m; + } else if (did == target_bits) { + target_mask = m; + } else if (did == target_bits + TARGET_WINDOW_MASK_BITS) { + small_mask = m; + } + } +} + +static inline bool _scan( + // these are our cursor/postion... + bufferlist::buffers_t::const_iterator *p, + const char **pp, const char **pe, + size_t& pos, + size_t max, // how much to read + uint64_t& fp, // fingerprint + uint64_t mask, const uint64_t *table) +{ + while (pos < max) { + if (*pp == *pe) { + ++(*p); + *pp = (*p)->c_str(); + *pe = *pp + (*p)->length(); + } + const char *te = std::min(*pe, *pp + max - pos); + for (; *pp < te; ++(*pp), ++pos) { + if ((fp & mask) == mask) { + return false; + } + fp = (fp << 1) ^ table[*(unsigned char*)*pp]; + } + if (pos >= max) { + return true; + } + } + return true; +} + +void FastCDC::calc_chunks( + const bufferlist& bl, + std::vector<std::pair<uint64_t, uint64_t>> *chunks) const +{ + if (bl.length() == 0) { + return; + } + auto p = bl.buffers().begin(); + const char *pp = p->c_str(); + const char *pe = pp + p->length(); + + size_t pos = 0; + size_t len = bl.length(); + while (pos < len) { + size_t cstart = pos; + uint64_t fp = 0; + + // are we left with a min-sized (or smaller) chunk? + if (len - pos <= (1ul << min_bits)) { + chunks->push_back(std::pair<uint64_t,uint64_t>(pos, len - pos)); + break; + } + + // skip forward to the min chunk size cut point (minus the window, so + // we can initialize the rolling fingerprint). + size_t skip = (1 << min_bits) - window; + pos += skip; + while (skip) { + size_t s = std::min<size_t>(pe - pp, skip); + skip -= s; + pp += s; + if (pp == pe) { + ++p; + pp = p->c_str(); + pe = pp + p->length(); + } + } + + // first fill the window + size_t max = pos + window; + while (pos < max) { + if (pp == pe) { + ++p; + pp = p->c_str(); + pe = pp + p->length(); + } + const char *te = std::min(pe, pp + (max - pos)); + for (; pp < te; ++pp, ++pos) { + fp = (fp << 1) ^ table[*(unsigned char*)pp]; + } + } + ceph_assert(pos < len); + + // find an end marker + if ( + // for the first "small" region + _scan(&p, &pp, &pe, pos, + std::min(len, cstart + (1 << (target_bits - TARGET_WINDOW_BITS))), + fp, small_mask, table) && + // for the middle range (close to our target) + (TARGET_WINDOW_BITS == 0 || + _scan(&p, &pp, &pe, pos, + std::min(len, cstart + (1 << (target_bits + TARGET_WINDOW_BITS))), + fp, target_mask, table)) && + // we're past target, use large_mask! + _scan(&p, &pp, &pe, pos, + std::min(len, + cstart + (1 << max_bits)), + fp, large_mask, table)) + ; + + chunks->push_back(std::pair<uint64_t,uint64_t>(cstart, pos - cstart)); + } +} |