Adding upstream version 18.2.2.upstream/18.2.2

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
commit: e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree: 64f88b554b444a49f656b6c656111a145cbbaa28 /src/common/FastCDC.cc
parent: Initial commit. (diff)
download: ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
1 files changed, 176 insertions, 0 deletions
diff --git a/src/common/FastCDC.cc b/src/common/FastCDC.cc
new file mode 100644
index 000000000..941fc873c
--- /dev/null
+++ b/src/common/FastCDC.cc
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <random>
+
+#include "FastCDC.h"
+
+
+// Unlike FastCDC described in the paper, if we are close to the
+// target, use the target mask.  If we are very small or very large,
+// use an adjusted mask--like the paper.  This tries to keep more
+// cut points using the same mask, and fewer using the small or large
+// masks.
+
+// How many more/fewer bits to set in the small/large masks.
+//
+// This is the "normalization level" or "NC level" in the FastCDC
+// paper.
+#define TARGET_WINDOW_MASK_BITS  2
+
+// How big the 'target window' is (in which we use the target mask).
+//
+// In the FastCDC paper, this is always 0: there is not "target
+// window," and either small_mask (maskS) or large_mask (maskL) is
+// used--never target_mask (maskA).
+#define TARGET_WINDOW_BITS       1
+
+// How many bits larger/smaller than target for hard limits on chunk
+// size.
+//
+// We assume the min and max sizes are always this many bits
+// larger/smaller than the target.  (Note that the FastCDC paper 8KB
+// example has a min of 2KB (2 bits smaller) and max of 64 KB (3 bits
+// larger), although it is not clear why they chose those values.)
+#define SIZE_WINDOW_BITS         2
+
+void FastCDC::_setup(int target, int size_window_bits)
+{
+  target_bits = target;
+
+  if (!size_window_bits) {
+    size_window_bits = SIZE_WINDOW_BITS;
+  }
+  min_bits = target - size_window_bits;
+  max_bits = target + size_window_bits;
+
+  std::mt19937_64 engine;
+
+  // prefill table
+  for (unsigned i = 0; i < 256; ++i) {
+    table[i] = engine();
+  }
+
+  // set mask
+  int did = 0;
+  uint64_t m = 0;
+  while (did < target_bits + TARGET_WINDOW_MASK_BITS) {
+    uint64_t bit = 1ull << (engine() & 63);
+    if (m & bit) {
+      continue;	// this bit is already set
+    }
+    m |= bit;
+    ++did;
+    if (did == target_bits - TARGET_WINDOW_MASK_BITS) {
+      large_mask = m;
+    } else if (did == target_bits) {
+      target_mask = m;
+    } else if (did == target_bits + TARGET_WINDOW_MASK_BITS) {
+      small_mask = m;
+    }
+  }
+}
+
+static inline bool _scan(
+  // these are our cursor/postion...
+  bufferlist::buffers_t::const_iterator *p,
+  const char **pp, const char **pe,
+  size_t& pos,
+  size_t max,   // how much to read
+  uint64_t& fp, // fingerprint
+  uint64_t mask, const uint64_t *table)
+{
+  while (pos < max) {
+    if (*pp == *pe) {
+      ++(*p);
+      *pp = (*p)->c_str();
+      *pe = *pp + (*p)->length();
+    }
+    const char *te = std::min(*pe, *pp + max - pos);
+    for (; *pp < te; ++(*pp), ++pos) {
+      if ((fp & mask) == mask) {
+	return false;
+      }
+      fp = (fp << 1) ^ table[*(unsigned char*)*pp];
+    }
+    if (pos >= max) {
+      return true;
+    }
+  }
+  return true;
+}
+
+void FastCDC::calc_chunks(
+  const bufferlist& bl,
+  std::vector<std::pair<uint64_t, uint64_t>> *chunks) const
+{
+  if (bl.length() == 0) {
+    return;
+  }
+  auto p = bl.buffers().begin();
+  const char *pp = p->c_str();
+  const char *pe = pp + p->length();
+
+  size_t pos = 0;
+  size_t len = bl.length();
+  while (pos < len) {
+    size_t cstart = pos;
+    uint64_t fp = 0;
+
+    // are we left with a min-sized (or smaller) chunk?
+    if (len - pos <= (1ul << min_bits)) {
+      chunks->push_back(std::pair<uint64_t,uint64_t>(pos, len - pos));
+      break;
+    }
+
+    // skip forward to the min chunk size cut point (minus the window, so
+    // we can initialize the rolling fingerprint).
+    size_t skip = (1 << min_bits) - window;
+    pos += skip;
+    while (skip) {
+      size_t s = std::min<size_t>(pe - pp, skip);
+      skip -= s;
+      pp += s;
+      if (pp == pe) {
+	++p;
+	pp = p->c_str();
+	pe = pp + p->length();
+      }
+    }
+
+    // first fill the window
+    size_t max = pos + window;
+    while (pos < max) {
+      if (pp == pe) {
+	++p;
+	pp = p->c_str();
+	pe = pp + p->length();
+      }
+      const char *te = std::min(pe, pp + (max - pos));
+      for (; pp < te; ++pp, ++pos) {
+	fp = (fp << 1) ^ table[*(unsigned char*)pp];
+      }
+    }
+    ceph_assert(pos < len);
+
+    // find an end marker
+    if (
+      // for the first "small" region
+      _scan(&p, &pp, &pe, pos,
+	    std::min(len, cstart + (1 << (target_bits - TARGET_WINDOW_BITS))),
+	    fp, small_mask, table) &&
+      // for the middle range (close to our target)
+      (TARGET_WINDOW_BITS == 0 ||
+       _scan(&p, &pp, &pe, pos,
+	     std::min(len, cstart + (1 << (target_bits + TARGET_WINDOW_BITS))),
+	     fp, target_mask, table)) &&
+      // we're past target, use large_mask!
+      _scan(&p, &pp, &pe, pos,
+	    std::min(len,
+		     cstart + (1 << max_bits)),
+	    fp, large_mask, table))
+      ;
+
+    chunks->push_back(std::pair<uint64_t,uint64_t>(cstart, pos - cstart));
+  }
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
commit	e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree	64f88b554b444a49f656b6c656111a145cbbaa28 /src/common/FastCDC.cc
parent	Initial commit. (diff)
download	ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip