Diffstat (limited to 'src/backend/lib')
-rw-r--r--  src/backend/lib/Makefile          |   27
-rw-r--r--  src/backend/lib/README            |   35
-rw-r--r--  src/backend/lib/binaryheap.c      |  307
-rw-r--r--  src/backend/lib/bipartite_match.c |  180
-rw-r--r--  src/backend/lib/bloomfilter.c     |  295
-rw-r--r--  src/backend/lib/dshash.c          |  882
-rw-r--r--  src/backend/lib/hyperloglog.c     |  255
-rw-r--r--  src/backend/lib/ilist.c           |  111
-rw-r--r--  src/backend/lib/integerset.c      | 1045
-rw-r--r--  src/backend/lib/knapsack.c        |  111
-rw-r--r--  src/backend/lib/pairingheap.c     |  333
-rw-r--r--  src/backend/lib/rbtree.c          |  770
12 files changed, 4351 insertions, 0 deletions
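
As a rough illustration of the binaryheap API introduced below: per the comments in
binaryheap.c, a heap can be bulk-loaded with binaryheap_add_unordered() and heapified
once with binaryheap_build(), and the element ranked highest by the caller's comparator
stays at the root. The following sketch is illustrative only and is not part of the
patch; cmp_int32() and binaryheap_demo() are invented names, and the comparator
signature is inferred from how bh_compare is invoked in binaryheap.c.

    /* Illustrative usage sketch, not part of the patch. */
    #include "postgres.h"
    #include "lib/binaryheap.h"

    /* Rank larger integers higher, so the largest value sits at the root. */
    static int
    cmp_int32(Datum a, Datum b, void *arg)
    {
        int32 va = DatumGetInt32(a);
        int32 vb = DatumGetInt32(b);

        if (va < vb)
            return -1;
        if (va > vb)
            return 1;
        return 0;
    }

    static void
    binaryheap_demo(void)
    {
        binaryheap *heap = binaryheap_allocate(3, cmp_int32, NULL);

        /* Bulk-load without maintaining the heap property, then heapify once. */
        binaryheap_add_unordered(heap, Int32GetDatum(4));
        binaryheap_add_unordered(heap, Int32GetDatum(9));
        binaryheap_add_unordered(heap, Int32GetDatum(1));
        binaryheap_build(heap);

        /* Drains in descending order: 9, 4, 1. */
        while (!binaryheap_empty(heap))
            elog(LOG, "popped %d",
                 DatumGetInt32(binaryheap_remove_first(heap)));

        binaryheap_free(heap);
    }

To obtain a min-heap instead, invert the comparison (or negate its result), since the
heap keeps the comparator's "largest" element at the root.
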
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile new file mode 100644 index 0000000..9dad313 --- /dev/null +++ b/src/backend/lib/Makefile @@ -0,0 +1,27 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for lib (miscellaneous stuff) +# +# IDENTIFICATION +# src/backend/lib/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/lib +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + binaryheap.o \ + bipartite_match.o \ + bloomfilter.o \ + dshash.o \ + hyperloglog.o \ + ilist.o \ + integerset.o \ + knapsack.o \ + pairingheap.o \ + rbtree.o \ + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/README b/src/backend/lib/README new file mode 100644 index 0000000..f2fb591 --- /dev/null +++ b/src/backend/lib/README @@ -0,0 +1,35 @@ +This directory contains a general purpose data structures, for use anywhere +in the backend: + +binaryheap.c - a binary heap + +bipartite_match.c - Hopcroft-Karp maximum cardinality algorithm for bipartite graphs + +bloomfilter.c - probabilistic, space-efficient set membership testing + +dshash.c - concurrent hash tables backed by dynamic shared memory areas + +hyperloglog.c - a streaming cardinality estimator + +ilist.c - single and double-linked lists + +integerset.c - a data structure for holding large set of integers + +knapsack.c - knapsack problem solver + +pairingheap.c - a pairing heap + +rbtree.c - a red-black tree + +stringinfo.c - an extensible string type + + +Aside from the inherent characteristics of the data structures, there are a +few practical differences between the binary heap and the pairing heap. The +binary heap is fully allocated at creation, and cannot be expanded beyond the +allocated size. The pairing heap on the other hand has no inherent maximum +size, but the caller needs to allocate each element being stored in the heap, +while the binary heap works with plain Datums or pointers. + +The linked-lists in ilist.c can be embedded directly into other structs, as +opposed to the List interface in nodes/pg_list.h. diff --git a/src/backend/lib/binaryheap.c b/src/backend/lib/binaryheap.c new file mode 100644 index 0000000..d54e245 --- /dev/null +++ b/src/backend/lib/binaryheap.c @@ -0,0 +1,307 @@ +/*------------------------------------------------------------------------- + * + * binaryheap.c + * A simple binary heap implementation + * + * Portions Copyright (c) 2012-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/lib/binaryheap.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <math.h> + +#include "lib/binaryheap.h" + +static void sift_down(binaryheap *heap, int node_off); +static void sift_up(binaryheap *heap, int node_off); +static inline void swap_nodes(binaryheap *heap, int a, int b); + +/* + * binaryheap_allocate + * + * Returns a pointer to a newly-allocated heap that has the capacity to + * store the given number of nodes, with the heap property defined by + * the given comparator function, which will be invoked with the additional + * argument specified by 'arg'. 
+ */ +binaryheap * +binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg) +{ + int sz; + binaryheap *heap; + + sz = offsetof(binaryheap, bh_nodes) + sizeof(Datum) * capacity; + heap = (binaryheap *) palloc(sz); + heap->bh_space = capacity; + heap->bh_compare = compare; + heap->bh_arg = arg; + + heap->bh_size = 0; + heap->bh_has_heap_property = true; + + return heap; +} + +/* + * binaryheap_reset + * + * Resets the heap to an empty state, losing its data content but not the + * parameters passed at allocation. + */ +void +binaryheap_reset(binaryheap *heap) +{ + heap->bh_size = 0; + heap->bh_has_heap_property = true; +} + +/* + * binaryheap_free + * + * Releases memory used by the given binaryheap. + */ +void +binaryheap_free(binaryheap *heap) +{ + pfree(heap); +} + +/* + * These utility functions return the offset of the left child, right + * child, and parent of the node at the given index, respectively. + * + * The heap is represented as an array of nodes, with the root node + * stored at index 0. The left child of node i is at index 2*i+1, and + * the right child at 2*i+2. The parent of node i is at index (i-1)/2. + */ + +static inline int +left_offset(int i) +{ + return 2 * i + 1; +} + +static inline int +right_offset(int i) +{ + return 2 * i + 2; +} + +static inline int +parent_offset(int i) +{ + return (i - 1) / 2; +} + +/* + * binaryheap_add_unordered + * + * Adds the given datum to the end of the heap's list of nodes in O(1) without + * preserving the heap property. This is a convenience to add elements quickly + * to a new heap. To obtain a valid heap, one must call binaryheap_build() + * afterwards. + */ +void +binaryheap_add_unordered(binaryheap *heap, Datum d) +{ + if (heap->bh_size >= heap->bh_space) + elog(ERROR, "out of binary heap slots"); + heap->bh_has_heap_property = false; + heap->bh_nodes[heap->bh_size] = d; + heap->bh_size++; +} + +/* + * binaryheap_build + * + * Assembles a valid heap in O(n) from the nodes added by + * binaryheap_add_unordered(). Not needed otherwise. + */ +void +binaryheap_build(binaryheap *heap) +{ + int i; + + for (i = parent_offset(heap->bh_size - 1); i >= 0; i--) + sift_down(heap, i); + heap->bh_has_heap_property = true; +} + +/* + * binaryheap_add + * + * Adds the given datum to the heap in O(log n) time, while preserving + * the heap property. + */ +void +binaryheap_add(binaryheap *heap, Datum d) +{ + if (heap->bh_size >= heap->bh_space) + elog(ERROR, "out of binary heap slots"); + heap->bh_nodes[heap->bh_size] = d; + heap->bh_size++; + sift_up(heap, heap->bh_size - 1); +} + +/* + * binaryheap_first + * + * Returns a pointer to the first (root, topmost) node in the heap + * without modifying the heap. The caller must ensure that this + * routine is not used on an empty heap. Always O(1). + */ +Datum +binaryheap_first(binaryheap *heap) +{ + Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property); + return heap->bh_nodes[0]; +} + +/* + * binaryheap_remove_first + * + * Removes the first (root, topmost) node in the heap and returns a + * pointer to it after rebalancing the heap. The caller must ensure + * that this routine is not used on an empty heap. O(log n) worst + * case. + */ +Datum +binaryheap_remove_first(binaryheap *heap) +{ + Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property); + + if (heap->bh_size == 1) + { + heap->bh_size--; + return heap->bh_nodes[0]; + } + + /* + * Swap the root and last nodes, decrease the size of the heap (i.e. 
+ * remove the former root node) and sift the new root node down to its + * correct position. + */ + swap_nodes(heap, 0, heap->bh_size - 1); + heap->bh_size--; + sift_down(heap, 0); + + return heap->bh_nodes[heap->bh_size]; +} + +/* + * binaryheap_replace_first + * + * Replace the topmost element of a non-empty heap, preserving the heap + * property. O(1) in the best case, or O(log n) if it must fall back to + * sifting the new node down. + */ +void +binaryheap_replace_first(binaryheap *heap, Datum d) +{ + Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property); + + heap->bh_nodes[0] = d; + + if (heap->bh_size > 1) + sift_down(heap, 0); +} + +/* + * Swap the contents of two nodes. + */ +static inline void +swap_nodes(binaryheap *heap, int a, int b) +{ + Datum swap; + + swap = heap->bh_nodes[a]; + heap->bh_nodes[a] = heap->bh_nodes[b]; + heap->bh_nodes[b] = swap; +} + +/* + * Sift a node up to the highest position it can hold according to the + * comparator. + */ +static void +sift_up(binaryheap *heap, int node_off) +{ + while (node_off != 0) + { + int cmp; + int parent_off; + + /* + * If this node is smaller than its parent, the heap condition is + * satisfied, and we're done. + */ + parent_off = parent_offset(node_off); + cmp = heap->bh_compare(heap->bh_nodes[node_off], + heap->bh_nodes[parent_off], + heap->bh_arg); + if (cmp <= 0) + break; + + /* + * Otherwise, swap the node and its parent and go on to check the + * node's new parent. + */ + swap_nodes(heap, node_off, parent_off); + node_off = parent_off; + } +} + +/* + * Sift a node down from its current position to satisfy the heap + * property. + */ +static void +sift_down(binaryheap *heap, int node_off) +{ + while (true) + { + int left_off = left_offset(node_off); + int right_off = right_offset(node_off); + int swap_off = 0; + + /* Is the left child larger than the parent? */ + if (left_off < heap->bh_size && + heap->bh_compare(heap->bh_nodes[node_off], + heap->bh_nodes[left_off], + heap->bh_arg) < 0) + swap_off = left_off; + + /* Is the right child larger than the parent? */ + if (right_off < heap->bh_size && + heap->bh_compare(heap->bh_nodes[node_off], + heap->bh_nodes[right_off], + heap->bh_arg) < 0) + { + /* swap with the larger child */ + if (!swap_off || + heap->bh_compare(heap->bh_nodes[left_off], + heap->bh_nodes[right_off], + heap->bh_arg) < 0) + swap_off = right_off; + } + + /* + * If we didn't find anything to swap, the heap condition is + * satisfied, and we're done. + */ + if (!swap_off) + break; + + /* + * Otherwise, swap the node with the child that violates the heap + * property; then go on to check its children. 
+ */ + swap_nodes(heap, swap_off, node_off); + node_off = swap_off; + } +} diff --git a/src/backend/lib/bipartite_match.c b/src/backend/lib/bipartite_match.c new file mode 100644 index 0000000..baa1c13 --- /dev/null +++ b/src/backend/lib/bipartite_match.c @@ -0,0 +1,180 @@ +/*------------------------------------------------------------------------- + * + * bipartite_match.c + * Hopcroft-Karp maximum cardinality algorithm for bipartite graphs + * + * This implementation is based on pseudocode found at: + * + * https://en.wikipedia.org/w/index.php?title=Hopcroft%E2%80%93Karp_algorithm&oldid=593898016 + * + * Copyright (c) 2015-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/lib/bipartite_match.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <limits.h> + +#include "lib/bipartite_match.h" +#include "miscadmin.h" + +/* + * The distances computed in hk_breadth_search can easily be seen to never + * exceed u_size. Since we restrict u_size to be less than SHRT_MAX, we + * can therefore use SHRT_MAX as the "infinity" distance needed as a marker. + */ +#define HK_INFINITY SHRT_MAX + +static bool hk_breadth_search(BipartiteMatchState *state); +static bool hk_depth_search(BipartiteMatchState *state, int u); + +/* + * Given the size of U and V, where each is indexed 1..size, and an adjacency + * list, perform the matching and return the resulting state. + */ +BipartiteMatchState * +BipartiteMatch(int u_size, int v_size, short **adjacency) +{ + BipartiteMatchState *state = palloc(sizeof(BipartiteMatchState)); + + if (u_size < 0 || u_size >= SHRT_MAX || + v_size < 0 || v_size >= SHRT_MAX) + elog(ERROR, "invalid set size for BipartiteMatch"); + + state->u_size = u_size; + state->v_size = v_size; + state->adjacency = adjacency; + state->matching = 0; + state->pair_uv = (short *) palloc0((u_size + 1) * sizeof(short)); + state->pair_vu = (short *) palloc0((v_size + 1) * sizeof(short)); + state->distance = (short *) palloc((u_size + 1) * sizeof(short)); + state->queue = (short *) palloc((u_size + 2) * sizeof(short)); + + while (hk_breadth_search(state)) + { + int u; + + for (u = 1; u <= u_size; u++) + { + if (state->pair_uv[u] == 0) + if (hk_depth_search(state, u)) + state->matching++; + } + + CHECK_FOR_INTERRUPTS(); /* just in case */ + } + + return state; +} + +/* + * Free a state returned by BipartiteMatch, except for the original adjacency + * list, which is owned by the caller. This only frees memory, so it's optional. + */ +void +BipartiteMatchFree(BipartiteMatchState *state) +{ + /* adjacency matrix is treated as owned by the caller */ + pfree(state->pair_uv); + pfree(state->pair_vu); + pfree(state->distance); + pfree(state->queue); + pfree(state); +} + +/* + * Perform the breadth-first search step of H-K matching. + * Returns true if successful. + */ +static bool +hk_breadth_search(BipartiteMatchState *state) +{ + int usize = state->u_size; + short *queue = state->queue; + short *distance = state->distance; + int qhead = 0; /* we never enqueue any node more than once */ + int qtail = 0; /* so don't have to worry about wrapping */ + int u; + + distance[0] = HK_INFINITY; + + for (u = 1; u <= usize; u++) + { + if (state->pair_uv[u] == 0) + { + distance[u] = 0; + queue[qhead++] = u; + } + else + distance[u] = HK_INFINITY; + } + + while (qtail < qhead) + { + u = queue[qtail++]; + + if (distance[u] < distance[0]) + { + short *u_adj = state->adjacency[u]; + int i = u_adj ? 
u_adj[0] : 0; + + for (; i > 0; i--) + { + int u_next = state->pair_vu[u_adj[i]]; + + if (distance[u_next] == HK_INFINITY) + { + distance[u_next] = 1 + distance[u]; + Assert(qhead < usize + 2); + queue[qhead++] = u_next; + } + } + } + } + + return (distance[0] != HK_INFINITY); +} + +/* + * Perform the depth-first search step of H-K matching. + * Returns true if successful. + */ +static bool +hk_depth_search(BipartiteMatchState *state, int u) +{ + short *distance = state->distance; + short *pair_uv = state->pair_uv; + short *pair_vu = state->pair_vu; + short *u_adj = state->adjacency[u]; + int i = u_adj ? u_adj[0] : 0; + short nextdist; + + if (u == 0) + return true; + if (distance[u] == HK_INFINITY) + return false; + nextdist = distance[u] + 1; + + check_stack_depth(); + + for (; i > 0; i--) + { + int v = u_adj[i]; + + if (distance[pair_vu[v]] == nextdist) + { + if (hk_depth_search(state, pair_vu[v])) + { + pair_vu[v] = u; + pair_uv[u] = v; + return true; + } + } + } + + distance[u] = HK_INFINITY; + return false; +} diff --git a/src/backend/lib/bloomfilter.c b/src/backend/lib/bloomfilter.c new file mode 100644 index 0000000..daf2c40 --- /dev/null +++ b/src/backend/lib/bloomfilter.c @@ -0,0 +1,295 @@ +/*------------------------------------------------------------------------- + * + * bloomfilter.c + * Space-efficient set membership testing + * + * A Bloom filter is a probabilistic data structure that is used to test an + * element's membership of a set. False positives are possible, but false + * negatives are not; a test of membership of the set returns either "possibly + * in set" or "definitely not in set". This is typically very space efficient, + * which can be a decisive advantage. + * + * Elements can be added to the set, but not removed. The more elements that + * are added, the larger the probability of false positives. Caller must hint + * an estimated total size of the set when the Bloom filter is initialized. + * This is used to balance the use of memory against the final false positive + * rate. + * + * The implementation is well suited to data synchronization problems between + * unordered sets, especially where predictable performance is important and + * some false positives are acceptable. It's also well suited to cache + * filtering problems where a relatively small and/or low cardinality set is + * fingerprinted, especially when many subsequent membership tests end up + * indicating that values of interest are not present. That should save the + * caller many authoritative lookups, such as expensive probes of a much larger + * on-disk structure. + * + * Copyright (c) 2018-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/lib/bloomfilter.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "common/hashfn.h" +#include "lib/bloomfilter.h" +#include "port/pg_bitutils.h" + +#define MAX_HASH_FUNCS 10 + +struct bloom_filter +{ + /* K hash functions are used, seeded by caller's seed */ + int k_hash_funcs; + uint64 seed; + /* m is bitset size, in bits. Must be a power of two <= 2^32. 
*/ + uint64 m; + unsigned char bitset[FLEXIBLE_ARRAY_MEMBER]; +}; + +static int my_bloom_power(uint64 target_bitset_bits); +static int optimal_k(uint64 bitset_bits, int64 total_elems); +static void k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem, + size_t len); +static inline uint32 mod_m(uint32 a, uint64 m); + +/* + * Create Bloom filter in caller's memory context. We aim for a false positive + * rate of between 1% and 2% when bitset size is not constrained by memory + * availability. + * + * total_elems is an estimate of the final size of the set. It should be + * approximately correct, but the implementation can cope well with it being + * off by perhaps a factor of five or more. See "Bloom Filters in + * Probabilistic Verification" (Dillinger & Manolios, 2004) for details of why + * this is the case. + * + * bloom_work_mem is sized in KB, in line with the general work_mem convention. + * This determines the size of the underlying bitset (trivial bookkeeping space + * isn't counted). The bitset is always sized as a power of two number of + * bits, and the largest possible bitset is 512MB (2^32 bits). The + * implementation allocates only enough memory to target its standard false + * positive rate, using a simple formula with caller's total_elems estimate as + * an input. The bitset might be as small as 1MB, even when bloom_work_mem is + * much higher. + * + * The Bloom filter is seeded using a value provided by the caller. Using a + * distinct seed value on every call makes it unlikely that the same false + * positives will reoccur when the same set is fingerprinted a second time. + * Callers that don't care about this pass a constant as their seed, typically + * 0. Callers can use a pseudo-random seed in the range of 0 - INT_MAX by + * calling random(). + */ +bloom_filter * +bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed) +{ + bloom_filter *filter; + int bloom_power; + uint64 bitset_bytes; + uint64 bitset_bits; + + /* + * Aim for two bytes per element; this is sufficient to get a false + * positive rate below 1%, independent of the size of the bitset or total + * number of elements. Also, if rounding down the size of the bitset to + * the next lowest power of two turns out to be a significant drop, the + * false positive rate still won't exceed 2% in almost all cases. + */ + bitset_bytes = Min(bloom_work_mem * UINT64CONST(1024), total_elems * 2); + bitset_bytes = Max(1024 * 1024, bitset_bytes); + + /* + * Size in bits should be the highest power of two <= target. 
bitset_bits + * is uint64 because PG_UINT32_MAX is 2^32 - 1, not 2^32 + */ + bloom_power = my_bloom_power(bitset_bytes * BITS_PER_BYTE); + bitset_bits = UINT64CONST(1) << bloom_power; + bitset_bytes = bitset_bits / BITS_PER_BYTE; + + /* Allocate bloom filter with unset bitset */ + filter = palloc0(offsetof(bloom_filter, bitset) + + sizeof(unsigned char) * bitset_bytes); + filter->k_hash_funcs = optimal_k(bitset_bits, total_elems); + filter->seed = seed; + filter->m = bitset_bits; + + return filter; +} + +/* + * Free Bloom filter + */ +void +bloom_free(bloom_filter *filter) +{ + pfree(filter); +} + +/* + * Add element to Bloom filter + */ +void +bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len) +{ + uint32 hashes[MAX_HASH_FUNCS]; + int i; + + k_hashes(filter, hashes, elem, len); + + /* Map a bit-wise address to a byte-wise address + bit offset */ + for (i = 0; i < filter->k_hash_funcs; i++) + { + filter->bitset[hashes[i] >> 3] |= 1 << (hashes[i] & 7); + } +} + +/* + * Test if Bloom filter definitely lacks element. + * + * Returns true if the element is definitely not in the set of elements + * observed by bloom_add_element(). Otherwise, returns false, indicating that + * element is probably present in set. + */ +bool +bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len) +{ + uint32 hashes[MAX_HASH_FUNCS]; + int i; + + k_hashes(filter, hashes, elem, len); + + /* Map a bit-wise address to a byte-wise address + bit offset */ + for (i = 0; i < filter->k_hash_funcs; i++) + { + if (!(filter->bitset[hashes[i] >> 3] & (1 << (hashes[i] & 7)))) + return true; + } + + return false; +} + +/* + * What proportion of bits are currently set? + * + * Returns proportion, expressed as a multiplier of filter size. That should + * generally be close to 0.5, even when we have more than enough memory to + * ensure a false positive rate within target 1% to 2% band, since more hash + * functions are used as more memory is available per element. + * + * This is the only instrumentation that is low overhead enough to appear in + * debug traces. When debugging Bloom filter code, it's likely to be far more + * interesting to directly test the false positive rate. + */ +double +bloom_prop_bits_set(bloom_filter *filter) +{ + int bitset_bytes = filter->m / BITS_PER_BYTE; + uint64 bits_set = pg_popcount((char *) filter->bitset, bitset_bytes); + + return bits_set / (double) filter->m; +} + +/* + * Which element in the sequence of powers of two is less than or equal to + * target_bitset_bits? + * + * Value returned here must be generally safe as the basis for actual bitset + * size. + * + * Bitset is never allowed to exceed 2 ^ 32 bits (512MB). This is sufficient + * for the needs of all current callers, and allows us to use 32-bit hash + * functions. It also makes it easy to stay under the MaxAllocSize restriction + * (caller needs to leave room for non-bitset fields that appear before + * flexible array member, so a 1GB bitset would use an allocation that just + * exceeds MaxAllocSize). + */ +static int +my_bloom_power(uint64 target_bitset_bits) +{ + int bloom_power = -1; + + while (target_bitset_bits > 0 && bloom_power < 32) + { + bloom_power++; + target_bitset_bits >>= 1; + } + + return bloom_power; +} + +/* + * Determine optimal number of hash functions based on size of filter in bits, + * and projected total number of elements. The optimal number is the number + * that minimizes the false positive rate. 
+ */ +static int +optimal_k(uint64 bitset_bits, int64 total_elems) +{ + int k = rint(log(2.0) * bitset_bits / total_elems); + + return Max(1, Min(k, MAX_HASH_FUNCS)); +} + +/* + * Generate k hash values for element. + * + * Caller passes array, which is filled-in with k values determined by hashing + * caller's element. + * + * Only 2 real independent hash functions are actually used to support an + * interface of up to MAX_HASH_FUNCS hash functions; enhanced double hashing is + * used to make this work. The main reason we prefer enhanced double hashing + * to classic double hashing is that the latter has an issue with collisions + * when using power of two sized bitsets. See Dillinger & Manolios for full + * details. + */ +static void +k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem, size_t len) +{ + uint64 hash; + uint32 x, + y; + uint64 m; + int i; + + /* Use 64-bit hashing to get two independent 32-bit hashes */ + hash = DatumGetUInt64(hash_any_extended(elem, len, filter->seed)); + x = (uint32) hash; + y = (uint32) (hash >> 32); + m = filter->m; + + x = mod_m(x, m); + y = mod_m(y, m); + + /* Accumulate hashes */ + hashes[0] = x; + for (i = 1; i < filter->k_hash_funcs; i++) + { + x = mod_m(x + y, m); + y = mod_m(y + i, m); + + hashes[i] = x; + } +} + +/* + * Calculate "val MOD m" inexpensively. + * + * Assumes that m (which is bitset size) is a power of two. + * + * Using a power of two number of bits for bitset size allows us to use bitwise + * AND operations to calculate the modulo of a hash value. It's also a simple + * way of avoiding the modulo bias effect. + */ +static inline uint32 +mod_m(uint32 val, uint64 m) +{ + Assert(m <= PG_UINT32_MAX + UINT64CONST(1)); + Assert(((m - 1) & m) == 0); + + return val & (m - 1); +} diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c new file mode 100644 index 0000000..99128b2 --- /dev/null +++ b/src/backend/lib/dshash.c @@ -0,0 +1,882 @@ +/*------------------------------------------------------------------------- + * + * dshash.c + * Concurrent hash tables backed by dynamic shared memory areas. + * + * This is an open hashing hash table, with a linked list at each table + * entry. It supports dynamic resizing, as required to prevent the linked + * lists from growing too long on average. Currently, only growing is + * supported: the hash table never becomes smaller. + * + * To deal with concurrency, it has a fixed size set of partitions, each of + * which is independently locked. Each bucket maps to a partition; so insert, + * find and iterate operations normally only acquire one lock. Therefore, + * good concurrency is achieved whenever such operations don't collide at the + * lock partition level. However, when a resize operation begins, all + * partition locks must be acquired simultaneously for a brief period. This + * is only expected to happen a small number of times until a stable size is + * found, since growth is geometric. + * + * Future versions may support iterators and incremental resizing; for now + * the implementation is minimalist. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/lib/dshash.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "common/hashfn.h" +#include "lib/dshash.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "utils/dsa.h" +#include "utils/memutils.h" + +/* + * An item in the hash table. This wraps the user's entry object in an + * envelop that holds a pointer back to the bucket and a pointer to the next + * item in the bucket. + */ +struct dshash_table_item +{ + /* The next item in the same bucket. */ + dsa_pointer next; + /* The hashed key, to avoid having to recompute it. */ + dshash_hash hash; + /* The user's entry object follows here. See ENTRY_FROM_ITEM(item). */ +}; + +/* + * The number of partitions for locking purposes. This is set to match + * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for + * the buffer pool must be good enough for any other purpose. This could + * become a runtime parameter in future. + */ +#define DSHASH_NUM_PARTITIONS_LOG2 7 +#define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2) + +/* A magic value used to identify our hash tables. */ +#define DSHASH_MAGIC 0x75ff6a20 + +/* + * Tracking information for each lock partition. Initially, each partition + * corresponds to one bucket, but each time the hash table grows, the buckets + * covered by each partition split so the number of buckets covered doubles. + * + * We might want to add padding here so that each partition is on a different + * cache line, but doing so would bloat this structure considerably. + */ +typedef struct dshash_partition +{ + LWLock lock; /* Protects all buckets in this partition. */ + size_t count; /* # of items in this partition's buckets */ +} dshash_partition; + +/* + * The head object for a hash table. This will be stored in dynamic shared + * memory. + */ +typedef struct dshash_table_control +{ + dshash_table_handle handle; + uint32 magic; + dshash_partition partitions[DSHASH_NUM_PARTITIONS]; + int lwlock_tranche_id; + + /* + * The following members are written to only when ALL partitions locks are + * held. They can be read when any one partition lock is held. + */ + + /* Number of buckets expressed as power of 2 (8 = 256 buckets). */ + size_t size_log2; /* log2(number of buckets) */ + dsa_pointer buckets; /* current bucket array */ +} dshash_table_control; + +/* + * Per-backend state for a dynamic hash table. + */ +struct dshash_table +{ + dsa_area *area; /* Backing dynamic shared memory area. */ + dshash_parameters params; /* Parameters. */ + void *arg; /* User-supplied data pointer. */ + dshash_table_control *control; /* Control object in DSM. */ + dsa_pointer *buckets; /* Current bucket pointers in DSM. */ + size_t size_log2; /* log2(number of buckets) */ +}; + +/* Given a pointer to an item, find the entry (user data) it holds. */ +#define ENTRY_FROM_ITEM(item) \ + ((char *)(item) + MAXALIGN(sizeof(dshash_table_item))) + +/* Given a pointer to an entry, find the item that holds it. */ +#define ITEM_FROM_ENTRY(entry) \ + ((dshash_table_item *)((char *)(entry) - \ + MAXALIGN(sizeof(dshash_table_item)))) + +/* How many resize operations (bucket splits) have there been? */ +#define NUM_SPLITS(size_log2) \ + (size_log2 - DSHASH_NUM_PARTITIONS_LOG2) + +/* How many buckets are there in each partition at a given size? 
*/ +#define BUCKETS_PER_PARTITION(size_log2) \ + (((size_t) 1) << NUM_SPLITS(size_log2)) + +/* Max entries before we need to grow. Half + quarter = 75% load factor. */ +#define MAX_COUNT_PER_PARTITION(hash_table) \ + (BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \ + BUCKETS_PER_PARTITION(hash_table->size_log2) / 4) + +/* Choose partition based on the highest order bits of the hash. */ +#define PARTITION_FOR_HASH(hash) \ + (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2)) + +/* + * Find the bucket index for a given hash and table size. Each time the table + * doubles in size, the appropriate bucket for a given hash value doubles and + * possibly adds one, depending on the newly revealed bit, so that all buckets + * are split. + */ +#define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2) \ + (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2))) + +/* The index of the first bucket in a given partition. */ +#define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \ + ((partition) << NUM_SPLITS(size_log2)) + +/* The head of the active bucket for a given hash value (lvalue). */ +#define BUCKET_FOR_HASH(hash_table, hash) \ + (hash_table->buckets[ \ + BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, \ + hash_table->size_log2)]) + +static void delete_item(dshash_table *hash_table, + dshash_table_item *item); +static void resize(dshash_table *hash_table, size_t new_size); +static inline void ensure_valid_bucket_pointers(dshash_table *hash_table); +static inline dshash_table_item *find_in_bucket(dshash_table *hash_table, + const void *key, + dsa_pointer item_pointer); +static void insert_item_into_bucket(dshash_table *hash_table, + dsa_pointer item_pointer, + dshash_table_item *item, + dsa_pointer *bucket); +static dshash_table_item *insert_into_bucket(dshash_table *hash_table, + const void *key, + dsa_pointer *bucket); +static bool delete_key_from_bucket(dshash_table *hash_table, + const void *key, + dsa_pointer *bucket_head); +static bool delete_item_from_bucket(dshash_table *hash_table, + dshash_table_item *item, + dsa_pointer *bucket_head); +static inline dshash_hash hash_key(dshash_table *hash_table, const void *key); +static inline bool equal_keys(dshash_table *hash_table, + const void *a, const void *b); + +#define PARTITION_LOCK(hash_table, i) \ + (&(hash_table)->control->partitions[(i)].lock) + +#define ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table) \ + Assert(!LWLockAnyHeldByMe(&(hash_table)->control->partitions[0].lock, \ + DSHASH_NUM_PARTITIONS, sizeof(dshash_partition))) + +/* + * Create a new hash table backed by the given dynamic shared area, with the + * given parameters. The returned object is allocated in backend-local memory + * using the current MemoryContext. 'arg' will be passed through to the + * compare and hash functions. + */ +dshash_table * +dshash_create(dsa_area *area, const dshash_parameters *params, void *arg) +{ + dshash_table *hash_table; + dsa_pointer control; + + /* Allocate the backend-local object representing the hash table. */ + hash_table = palloc(sizeof(dshash_table)); + + /* Allocate the control object in shared memory. */ + control = dsa_allocate(area, sizeof(dshash_table_control)); + + /* Set up the local and shared hash table structs. 
*/ + hash_table->area = area; + hash_table->params = *params; + hash_table->arg = arg; + hash_table->control = dsa_get_address(area, control); + hash_table->control->handle = control; + hash_table->control->magic = DSHASH_MAGIC; + hash_table->control->lwlock_tranche_id = params->tranche_id; + + /* Set up the array of lock partitions. */ + { + dshash_partition *partitions = hash_table->control->partitions; + int tranche_id = hash_table->control->lwlock_tranche_id; + int i; + + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + { + LWLockInitialize(&partitions[i].lock, tranche_id); + partitions[i].count = 0; + } + } + + /* + * Set up the initial array of buckets. Our initial size is the same as + * the number of partitions. + */ + hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2; + hash_table->control->buckets = + dsa_allocate_extended(area, + sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS, + DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO); + if (!DsaPointerIsValid(hash_table->control->buckets)) + { + dsa_free(area, control); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on DSA request of size %zu.", + sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS))); + } + hash_table->buckets = dsa_get_address(area, + hash_table->control->buckets); + hash_table->size_log2 = hash_table->control->size_log2; + + return hash_table; +} + +/* + * Attach to an existing hash table using a handle. The returned object is + * allocated in backend-local memory using the current MemoryContext. 'arg' + * will be passed through to the compare and hash functions. + */ +dshash_table * +dshash_attach(dsa_area *area, const dshash_parameters *params, + dshash_table_handle handle, void *arg) +{ + dshash_table *hash_table; + dsa_pointer control; + + /* Allocate the backend-local object representing the hash table. */ + hash_table = palloc(sizeof(dshash_table)); + + /* Find the control object in shared memory. */ + control = handle; + + /* Set up the local hash table struct. */ + hash_table->area = area; + hash_table->params = *params; + hash_table->arg = arg; + hash_table->control = dsa_get_address(area, control); + Assert(hash_table->control->magic == DSHASH_MAGIC); + + /* + * These will later be set to the correct values by + * ensure_valid_bucket_pointers(), at which time we'll be holding a + * partition lock for interlocking against concurrent resizing. + */ + hash_table->buckets = NULL; + hash_table->size_log2 = 0; + + return hash_table; +} + +/* + * Detach from a hash table. This frees backend-local resources associated + * with the hash table, but the hash table will continue to exist until it is + * either explicitly destroyed (by a backend that is still attached to it), or + * the area that backs it is returned to the operating system. + */ +void +dshash_detach(dshash_table *hash_table) +{ + ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table); + + /* The hash table may have been destroyed. Just free local memory. */ + pfree(hash_table); +} + +/* + * Destroy a hash table, returning all memory to the area. The caller must be + * certain that no other backend will attempt to access the hash table before + * calling this function. Other backend must explicitly call dshash_detach to + * free up backend-local memory associated with the hash table. The backend + * that calls dshash_destroy must not call dshash_detach. 
+ */ +void +dshash_destroy(dshash_table *hash_table) +{ + size_t size; + size_t i; + + Assert(hash_table->control->magic == DSHASH_MAGIC); + ensure_valid_bucket_pointers(hash_table); + + /* Free all the entries. */ + size = ((size_t) 1) << hash_table->size_log2; + for (i = 0; i < size; ++i) + { + dsa_pointer item_pointer = hash_table->buckets[i]; + + while (DsaPointerIsValid(item_pointer)) + { + dshash_table_item *item; + dsa_pointer next_item_pointer; + + item = dsa_get_address(hash_table->area, item_pointer); + next_item_pointer = item->next; + dsa_free(hash_table->area, item_pointer); + item_pointer = next_item_pointer; + } + } + + /* + * Vandalize the control block to help catch programming errors where + * other backends access the memory formerly occupied by this hash table. + */ + hash_table->control->magic = 0; + + /* Free the active table and control object. */ + dsa_free(hash_table->area, hash_table->control->buckets); + dsa_free(hash_table->area, hash_table->control->handle); + + pfree(hash_table); +} + +/* + * Get a handle that can be used by other processes to attach to this hash + * table. + */ +dshash_table_handle +dshash_get_hash_table_handle(dshash_table *hash_table) +{ + Assert(hash_table->control->magic == DSHASH_MAGIC); + + return hash_table->control->handle; +} + +/* + * Look up an entry, given a key. Returns a pointer to an entry if one can be + * found with the given key. Returns NULL if the key is not found. If a + * non-NULL value is returned, the entry is locked and must be released by + * calling dshash_release_lock. If an error is raised before + * dshash_release_lock is called, the lock will be released automatically, but + * the caller must take care to ensure that the entry is not left corrupted. + * The lock mode is either shared or exclusive depending on 'exclusive'. + * + * The caller must not hold a lock already. + * + * Note that the lock held is in fact an LWLock, so interrupts will be held on + * return from this function, and not resumed until dshash_release_lock is + * called. It is a very good idea for the caller to release the lock quickly. + */ +void * +dshash_find(dshash_table *hash_table, const void *key, bool exclusive) +{ + dshash_hash hash; + size_t partition; + dshash_table_item *item; + + hash = hash_key(hash_table, key); + partition = PARTITION_FOR_HASH(hash); + + Assert(hash_table->control->magic == DSHASH_MAGIC); + ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table); + + LWLockAcquire(PARTITION_LOCK(hash_table, partition), + exclusive ? LW_EXCLUSIVE : LW_SHARED); + ensure_valid_bucket_pointers(hash_table); + + /* Search the active bucket. */ + item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash)); + + if (!item) + { + /* Not found. */ + LWLockRelease(PARTITION_LOCK(hash_table, partition)); + return NULL; + } + else + { + /* The caller will free the lock by calling dshash_release_lock. */ + return ENTRY_FROM_ITEM(item); + } +} + +/* + * Returns a pointer to an exclusively locked item which must be released with + * dshash_release_lock. If the key is found in the hash table, 'found' is set + * to true and a pointer to the existing entry is returned. If the key is not + * found, 'found' is set to false, and a pointer to a newly created entry is + * returned. + * + * Notes above dshash_find() regarding locking and error handling equally + * apply here. 
+ */ +void * +dshash_find_or_insert(dshash_table *hash_table, + const void *key, + bool *found) +{ + dshash_hash hash; + size_t partition_index; + dshash_partition *partition; + dshash_table_item *item; + + hash = hash_key(hash_table, key); + partition_index = PARTITION_FOR_HASH(hash); + partition = &hash_table->control->partitions[partition_index]; + + Assert(hash_table->control->magic == DSHASH_MAGIC); + ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table); + +restart: + LWLockAcquire(PARTITION_LOCK(hash_table, partition_index), + LW_EXCLUSIVE); + ensure_valid_bucket_pointers(hash_table); + + /* Search the active bucket. */ + item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash)); + + if (item) + *found = true; + else + { + *found = false; + + /* Check if we are getting too full. */ + if (partition->count > MAX_COUNT_PER_PARTITION(hash_table)) + { + /* + * The load factor (= keys / buckets) for all buckets protected by + * this partition is > 0.75. Presumably the same applies + * generally across the whole hash table (though we don't attempt + * to track that directly to avoid contention on some kind of + * central counter; we just assume that this partition is + * representative). This is a good time to resize. + * + * Give up our existing lock first, because resizing needs to + * reacquire all the locks in the right order to avoid deadlocks. + */ + LWLockRelease(PARTITION_LOCK(hash_table, partition_index)); + resize(hash_table, hash_table->size_log2 + 1); + + goto restart; + } + + /* Finally we can try to insert the new item. */ + item = insert_into_bucket(hash_table, key, + &BUCKET_FOR_HASH(hash_table, hash)); + item->hash = hash; + /* Adjust per-lock-partition counter for load factor knowledge. */ + ++partition->count; + } + + /* The caller must release the lock with dshash_release_lock. */ + return ENTRY_FROM_ITEM(item); +} + +/* + * Remove an entry by key. Returns true if the key was found and the + * corresponding entry was removed. + * + * To delete an entry that you already have a pointer to, see + * dshash_delete_entry. + */ +bool +dshash_delete_key(dshash_table *hash_table, const void *key) +{ + dshash_hash hash; + size_t partition; + bool found; + + Assert(hash_table->control->magic == DSHASH_MAGIC); + ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table); + + hash = hash_key(hash_table, key); + partition = PARTITION_FOR_HASH(hash); + + LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE); + ensure_valid_bucket_pointers(hash_table); + + if (delete_key_from_bucket(hash_table, key, + &BUCKET_FOR_HASH(hash_table, hash))) + { + Assert(hash_table->control->partitions[partition].count > 0); + found = true; + --hash_table->control->partitions[partition].count; + } + else + found = false; + + LWLockRelease(PARTITION_LOCK(hash_table, partition)); + + return found; +} + +/* + * Remove an entry. The entry must already be exclusively locked, and must + * have been obtained by dshash_find or dshash_find_or_insert. Note that this + * function releases the lock just like dshash_release_lock. + * + * To delete an entry by key, see dshash_delete_key. 
+ */ +void +dshash_delete_entry(dshash_table *hash_table, void *entry) +{ + dshash_table_item *item = ITEM_FROM_ENTRY(entry); + size_t partition = PARTITION_FOR_HASH(item->hash); + + Assert(hash_table->control->magic == DSHASH_MAGIC); + Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition), + LW_EXCLUSIVE)); + + delete_item(hash_table, item); + LWLockRelease(PARTITION_LOCK(hash_table, partition)); +} + +/* + * Unlock an entry which was locked by dshash_find or dshash_find_or_insert. + */ +void +dshash_release_lock(dshash_table *hash_table, void *entry) +{ + dshash_table_item *item = ITEM_FROM_ENTRY(entry); + size_t partition_index = PARTITION_FOR_HASH(item->hash); + + Assert(hash_table->control->magic == DSHASH_MAGIC); + + LWLockRelease(PARTITION_LOCK(hash_table, partition_index)); +} + +/* + * A compare function that forwards to memcmp. + */ +int +dshash_memcmp(const void *a, const void *b, size_t size, void *arg) +{ + return memcmp(a, b, size); +} + +/* + * A hash function that forwards to tag_hash. + */ +dshash_hash +dshash_memhash(const void *v, size_t size, void *arg) +{ + return tag_hash(v, size); +} + +/* + * Print debugging information about the internal state of the hash table to + * stderr. The caller must hold no partition locks. + */ +void +dshash_dump(dshash_table *hash_table) +{ + size_t i; + size_t j; + + Assert(hash_table->control->magic == DSHASH_MAGIC); + ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table); + + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + { + Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i))); + LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED); + } + + ensure_valid_bucket_pointers(hash_table); + + fprintf(stderr, + "hash table size = %zu\n", (size_t) 1 << hash_table->size_log2); + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + { + dshash_partition *partition = &hash_table->control->partitions[i]; + size_t begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2); + size_t end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2); + + fprintf(stderr, " partition %zu\n", i); + fprintf(stderr, + " active buckets (key count = %zu)\n", partition->count); + + for (j = begin; j < end; ++j) + { + size_t count = 0; + dsa_pointer bucket = hash_table->buckets[j]; + + while (DsaPointerIsValid(bucket)) + { + dshash_table_item *item; + + item = dsa_get_address(hash_table->area, bucket); + + bucket = item->next; + ++count; + } + fprintf(stderr, " bucket %zu (key count = %zu)\n", j, count); + } + } + + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + LWLockRelease(PARTITION_LOCK(hash_table, i)); +} + +/* + * Delete a locked item to which we have a pointer. + */ +static void +delete_item(dshash_table *hash_table, dshash_table_item *item) +{ + size_t hash = item->hash; + size_t partition = PARTITION_FOR_HASH(hash); + + Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition))); + + if (delete_item_from_bucket(hash_table, item, + &BUCKET_FOR_HASH(hash_table, hash))) + { + Assert(hash_table->control->partitions[partition].count > 0); + --hash_table->control->partitions[partition].count; + } + else + { + Assert(false); + } +} + +/* + * Grow the hash table if necessary to the requested number of buckets. The + * requested size must be double some previously observed size. + * + * Must be called without any partition lock held. 
+ */ +static void +resize(dshash_table *hash_table, size_t new_size_log2) +{ + dsa_pointer old_buckets; + dsa_pointer new_buckets_shared; + dsa_pointer *new_buckets; + size_t size; + size_t new_size = ((size_t) 1) << new_size_log2; + size_t i; + + /* + * Acquire the locks for all lock partitions. This is expensive, but we + * shouldn't have to do it many times. + */ + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + { + Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i))); + + LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE); + if (i == 0 && hash_table->control->size_log2 >= new_size_log2) + { + /* + * Another backend has already increased the size; we can avoid + * obtaining all the locks and return early. + */ + LWLockRelease(PARTITION_LOCK(hash_table, 0)); + return; + } + } + + Assert(new_size_log2 == hash_table->control->size_log2 + 1); + + /* Allocate the space for the new table. */ + new_buckets_shared = dsa_allocate0(hash_table->area, + sizeof(dsa_pointer) * new_size); + new_buckets = dsa_get_address(hash_table->area, new_buckets_shared); + + /* + * We've allocated the new bucket array; all that remains to do now is to + * reinsert all items, which amounts to adjusting all the pointers. + */ + size = ((size_t) 1) << hash_table->control->size_log2; + for (i = 0; i < size; ++i) + { + dsa_pointer item_pointer = hash_table->buckets[i]; + + while (DsaPointerIsValid(item_pointer)) + { + dshash_table_item *item; + dsa_pointer next_item_pointer; + + item = dsa_get_address(hash_table->area, item_pointer); + next_item_pointer = item->next; + insert_item_into_bucket(hash_table, item_pointer, item, + &new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash, + new_size_log2)]); + item_pointer = next_item_pointer; + } + } + + /* Swap the hash table into place and free the old one. */ + old_buckets = hash_table->control->buckets; + hash_table->control->buckets = new_buckets_shared; + hash_table->control->size_log2 = new_size_log2; + hash_table->buckets = new_buckets; + dsa_free(hash_table->area, old_buckets); + + /* Release all the locks. */ + for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i) + LWLockRelease(PARTITION_LOCK(hash_table, i)); +} + +/* + * Make sure that our backend-local bucket pointers are up to date. The + * caller must have locked one lock partition, which prevents resize() from + * running concurrently. + */ +static inline void +ensure_valid_bucket_pointers(dshash_table *hash_table) +{ + if (hash_table->size_log2 != hash_table->control->size_log2) + { + hash_table->buckets = dsa_get_address(hash_table->area, + hash_table->control->buckets); + hash_table->size_log2 = hash_table->control->size_log2; + } +} + +/* + * Scan a locked bucket for a match, using the provided compare function. + */ +static inline dshash_table_item * +find_in_bucket(dshash_table *hash_table, const void *key, + dsa_pointer item_pointer) +{ + while (DsaPointerIsValid(item_pointer)) + { + dshash_table_item *item; + + item = dsa_get_address(hash_table->area, item_pointer); + if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item))) + return item; + item_pointer = item->next; + } + return NULL; +} + +/* + * Insert an already-allocated item into a bucket. 
+ */ +static void +insert_item_into_bucket(dshash_table *hash_table, + dsa_pointer item_pointer, + dshash_table_item *item, + dsa_pointer *bucket) +{ + Assert(item == dsa_get_address(hash_table->area, item_pointer)); + + item->next = *bucket; + *bucket = item_pointer; +} + +/* + * Allocate space for an entry with the given key and insert it into the + * provided bucket. + */ +static dshash_table_item * +insert_into_bucket(dshash_table *hash_table, + const void *key, + dsa_pointer *bucket) +{ + dsa_pointer item_pointer; + dshash_table_item *item; + + item_pointer = dsa_allocate(hash_table->area, + hash_table->params.entry_size + + MAXALIGN(sizeof(dshash_table_item))); + item = dsa_get_address(hash_table->area, item_pointer); + memcpy(ENTRY_FROM_ITEM(item), key, hash_table->params.key_size); + insert_item_into_bucket(hash_table, item_pointer, item, bucket); + return item; +} + +/* + * Search a bucket for a matching key and delete it. + */ +static bool +delete_key_from_bucket(dshash_table *hash_table, + const void *key, + dsa_pointer *bucket_head) +{ + while (DsaPointerIsValid(*bucket_head)) + { + dshash_table_item *item; + + item = dsa_get_address(hash_table->area, *bucket_head); + + if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item))) + { + dsa_pointer next; + + next = item->next; + dsa_free(hash_table->area, *bucket_head); + *bucket_head = next; + + return true; + } + bucket_head = &item->next; + } + return false; +} + +/* + * Delete the specified item from the bucket. + */ +static bool +delete_item_from_bucket(dshash_table *hash_table, + dshash_table_item *item, + dsa_pointer *bucket_head) +{ + while (DsaPointerIsValid(*bucket_head)) + { + dshash_table_item *bucket_item; + + bucket_item = dsa_get_address(hash_table->area, *bucket_head); + + if (bucket_item == item) + { + dsa_pointer next; + + next = item->next; + dsa_free(hash_table->area, *bucket_head); + *bucket_head = next; + return true; + } + bucket_head = &bucket_item->next; + } + return false; +} + +/* + * Compute the hash value for a key. + */ +static inline dshash_hash +hash_key(dshash_table *hash_table, const void *key) +{ + return hash_table->params.hash_function(key, + hash_table->params.key_size, + hash_table->arg); +} + +/* + * Check whether two keys compare equal. + */ +static inline bool +equal_keys(dshash_table *hash_table, const void *a, const void *b) +{ + return hash_table->params.compare_function(a, b, + hash_table->params.key_size, + hash_table->arg) == 0; +} diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c new file mode 100644 index 0000000..f4e0241 --- /dev/null +++ b/src/backend/lib/hyperloglog.c @@ -0,0 +1,255 @@ +/*------------------------------------------------------------------------- + * + * hyperloglog.c + * HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2021, PostgreSQL Global Development Group + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. 
+ * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "postgres.h" + +#include <math.h> + +#include "lib/hyperloglog.h" +#include "port/pg_bitutils.h" + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +static inline uint8 rho(uint32 x, uint8 b); + +/* + * Initialize HyperLogLog track state, by bit width + * + * bwidth is bit width (so register size will be 2 to the power of bwidth). + * Must be between 4 and 16 inclusive. + */ +void +initHyperLogLog(hyperLogLogState *cState, uint8 bwidth) +{ + double alpha; + + if (bwidth < 4 || bwidth > 16) + elog(ERROR, "bit width must be between 4 and 16 inclusive"); + + cState->registerWidth = bwidth; + cState->nRegisters = (Size) 1 << bwidth; + cState->arrSize = sizeof(uint8) * cState->nRegisters + 1; + + /* + * Initialize hashes array to zero, not negative infinity, per discussion + * of the coupon collector problem in the HyperLogLog paper + */ + cState->hashesArr = palloc0(cState->arrSize); + + /* + * "alpha" is a value that for each possible number of registers (m) is + * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z + * is "the indicator function" through which we finally compute E, + * estimated cardinality). + */ + switch (cState->nRegisters) + { + case 16: + alpha = 0.673; + break; + case 32: + alpha = 0.697; + break; + case 64: + alpha = 0.709; + break; + default: + alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters); + } + + /* + * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog + * estimate E + */ + cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters; +} + +/* + * Initialize HyperLogLog track state, by error rate + * + * Instead of specifying bwidth (number of bits used for addressing the + * register), this method allows sizing the counter for particular error + * rate using a simple formula from the paper: + * + * e = 1.04 / sqrt(m) + * + * where 'm' is the number of registers, i.e. (2^bwidth). The method + * finds the lowest bwidth with 'e' below the requested error rate, and + * then uses it to initialize the counter. + * + * As bwidth has to be between 4 and 16, the worst possible error rate + * is between ~25% (bwidth=4) and 0.4% (bwidth=16). 
+ */ +void +initHyperLogLogError(hyperLogLogState *cState, double error) +{ + uint8 bwidth = 4; + + while (bwidth < 16) + { + double m = (Size) 1 << bwidth; + + if (1.04 / sqrt(m) < error) + break; + bwidth++; + } + + initHyperLogLog(cState, bwidth); +} + +/* + * Free HyperLogLog track state + * + * Releases allocated resources, but not the state itself (in case it's not + * allocated by palloc). + */ +void +freeHyperLogLog(hyperLogLogState *cState) +{ + Assert(cState->hashesArr != NULL); + pfree(cState->hashesArr); +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. + */ +void +addHyperLogLog(hyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << cState->registerWidth, + BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + cState->hashesArr[index] = Max(count, cState->hashesArr[index]); +} + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateHyperLogLog(hyperLogLogState *cState) +{ + double result; + double sum = 0.0; + int i; + + for (i = 0; i < cState->nRegisters; i++) + { + sum += 1.0 / pow(2.0, cState->hashesArr[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = cState->alphaMM / sum; + + if (result <= (5.0 / 2.0) * cState->nRegisters) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < cState->nRegisters; i++) + { + if (cState->hashesArr[i] == 0) + zero_count++; + } + + if (zero_count != 0) + result = cState->nRegisters * log((double) cState->nRegisters / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering fist 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. 
+ */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + if (x == 0) + return b + 1; + + j = 32 - pg_leftmost_one_pos32(x); + + if (j > b) + return b + 1; + + return j; +} diff --git a/src/backend/lib/ilist.c b/src/backend/lib/ilist.c new file mode 100644 index 0000000..e9a07c1 --- /dev/null +++ b/src/backend/lib/ilist.c @@ -0,0 +1,111 @@ +/*------------------------------------------------------------------------- + * + * ilist.c + * support for integrated/inline doubly- and singly- linked lists + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/lib/ilist.c + * + * NOTES + * This file only contains functions that are too big to be considered + * for inlining. See ilist.h for most of the goodies. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "lib/ilist.h" + +/* + * Delete 'node' from list. + * + * It is not allowed to delete a 'node' which is not in the list 'head' + * + * Caution: this is O(n); consider using slist_delete_current() instead. + */ +void +slist_delete(slist_head *head, slist_node *node) +{ + slist_node *last = &head->head; + slist_node *cur; + bool found PG_USED_FOR_ASSERTS_ONLY = false; + + while ((cur = last->next) != NULL) + { + if (cur == node) + { + last->next = cur->next; +#ifdef USE_ASSERT_CHECKING + found = true; +#endif + break; + } + last = cur; + } + Assert(found); + + slist_check(head); +} + +#ifdef ILIST_DEBUG +/* + * Verify integrity of a doubly linked list + */ +void +dlist_check(dlist_head *head) +{ + dlist_node *cur; + + if (head == NULL) + elog(ERROR, "doubly linked list head address is NULL"); + + if (head->head.next == NULL && head->head.prev == NULL) + return; /* OK, initialized as zeroes */ + + /* iterate in forward direction */ + for (cur = head->head.next; cur != &head->head; cur = cur->next) + { + if (cur == NULL || + cur->next == NULL || + cur->prev == NULL || + cur->prev->next != cur || + cur->next->prev != cur) + elog(ERROR, "doubly linked list is corrupted"); + } + + /* iterate in backward direction */ + for (cur = head->head.prev; cur != &head->head; cur = cur->prev) + { + if (cur == NULL || + cur->next == NULL || + cur->prev == NULL || + cur->prev->next != cur || + cur->next->prev != cur) + elog(ERROR, "doubly linked list is corrupted"); + } +} + +/* + * Verify integrity of a singly linked list + */ +void +slist_check(slist_head *head) +{ + slist_node *cur; + + if (head == NULL) + elog(ERROR, "singly linked list head address is NULL"); + + /* + * there isn't much we can test in a singly linked list except that it + * actually ends sometime, i.e. hasn't introduced a cycle or similar + */ + for (cur = head->head.next; cur != NULL; cur = cur->next) + ; +} + +#endif /* ILIST_DEBUG */ diff --git a/src/backend/lib/integerset.c b/src/backend/lib/integerset.c new file mode 100644 index 0000000..278a91b --- /dev/null +++ b/src/backend/lib/integerset.c @@ -0,0 +1,1045 @@ +/*------------------------------------------------------------------------- + * + * integerset.c + * Data structure to hold a large set of 64-bit integers efficiently + * + * IntegerSet provides an in-memory data structure to hold a set of + * arbitrary 64-bit integers. 
Internally, the values are stored in a + * B-tree, with a special packed representation at the leaf level using + * the Simple-8b algorithm, which can pack clusters of nearby values + * very tightly. + * + * Memory consumption depends on the number of values stored, but also + * on how far the values are from each other. In the best case, with + * long runs of consecutive integers, memory consumption can be as low as + * 0.1 bytes per integer. In the worst case, if integers are more than + * 2^32 apart, it uses about 8 bytes per integer. In typical use, the + * consumption per integer is somewhere between those extremes, depending + * on the range of integers stored, and how "clustered" they are. + * + * + * Interface + * --------- + * + * intset_create - Create a new, empty set + * intset_add_member - Add an integer to the set + * intset_is_member - Test if an integer is in the set + * intset_begin_iterate - Begin iterating through all integers in set + * intset_iterate_next - Return next set member, if any + * + * intset_create() creates the set in the current memory context. Subsequent + * operations that add to the data structure will continue to allocate from + * that same context, even if it's not current anymore. + * + * Note that there is no function to free an integer set. If you need to do + * that, create a dedicated memory context to hold it, and destroy the memory + * context instead. + * + * + * Limitations + * ----------- + * + * - Values must be added in order. (Random insertions would require + * splitting nodes, which hasn't been implemented.) + * + * - Values cannot be added while iteration is in progress. + * + * - No support for removing values. + * + * None of these limitations are fundamental to the data structure, so they + * could be lifted if needed, by writing some new code. But the current + * users of this facility don't need them. + * + * + * References + * ---------- + * + * Simple-8b encoding is based on: + * + * Vo Ngoc Anh, Alistair Moffat, Index compression using 64-bit words, + * Software - Practice & Experience, v.40 n.2, p.131-147, February 2010 + * (https://doi.org/10.1002/spe.948) + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/lib/integerset.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "lib/integerset.h" +#include "port/pg_bitutils.h" +#include "utils/memutils.h" + + +/* + * Maximum number of integers that can be encoded in a single Simple-8b + * codeword. (Defined here before anything else, so that we can size arrays + * using this.) + */ +#define SIMPLE8B_MAX_VALUES_PER_CODEWORD 240 + +/* + * Parameters for shape of the in-memory B-tree. + * + * These set the size of each internal and leaf node. They don't necessarily + * need to be the same, because the tree is just an in-memory structure. + * With the default 64, each node is about 1 kb. + * + * If you change these, you must recalculate MAX_TREE_LEVELS, too! + */ +#define MAX_INTERNAL_ITEMS 64 +#define MAX_LEAF_ITEMS 64 + +/* + * Maximum height of the tree. + * + * MAX_TREE_ITEMS is calculated from the "fan-out" of the B-tree. The + * theoretical maximum number of items that we can store in a set is 2^64, + * so MAX_TREE_LEVELS should be set so that: + * + * MAX_LEAF_ITEMS * MAX_INTERNAL_ITEMS ^ (MAX_TREE_LEVELS - 1) >= 2^64. 
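+ *
+ * With the default of 64 items per node, that works out to
+ * 2^6 * (2^6)^10 = 2^66 >= 2^64, so 11 levels are sufficient.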
+ * + * In practice, we'll need far fewer levels, because you will run out of + * memory long before reaching that number, but let's be conservative. + */ +#define MAX_TREE_LEVELS 11 + +/* + * Node structures, for the in-memory B-tree. + * + * An internal node holds a number of downlink pointers to leaf nodes, or + * to internal nodes on a lower level. For each downlink, the key value + * corresponding to the lower level node is stored in a sorted array. The + * stored key values are low keys. In other words, if the downlink has value + * X, then all items stored on that child are >= X. + * + * Each leaf node holds a number of "items", with a varying number of + * integers packed into each item. Each item consists of two 64-bit words: + * The first word holds the first integer stored in the item, in plain format. + * The second word contains between 0 and 240 more integers, packed using + * Simple-8b encoding. By storing the first integer in plain, unpacked, + * format, we can use binary search to quickly find an item that holds (or + * would hold) a particular integer. And by storing the rest in packed form, + * we still get pretty good memory density, if there are clusters of integers + * with similar values. + * + * Each leaf node also has a pointer to the next leaf node, so that the leaf + * nodes can be easily walked from beginning to end when iterating. + */ +typedef struct intset_node intset_node; +typedef struct intset_leaf_node intset_leaf_node; +typedef struct intset_internal_node intset_internal_node; + +/* Common structure of both leaf and internal nodes. */ +struct intset_node +{ + uint16 level; /* tree level of this node */ + uint16 num_items; /* number of items in this node */ +}; + +/* Internal node */ +struct intset_internal_node +{ + /* common header, must match intset_node */ + uint16 level; /* >= 1 on internal nodes */ + uint16 num_items; + + /* + * 'values' is an array of key values, and 'downlinks' are pointers to + * lower-level nodes, corresponding to the key values. + */ + uint64 values[MAX_INTERNAL_ITEMS]; + intset_node *downlinks[MAX_INTERNAL_ITEMS]; +}; + +/* Leaf node */ +typedef struct +{ + uint64 first; /* first integer in this item */ + uint64 codeword; /* simple8b encoded differences from 'first' */ +} leaf_item; + +#define MAX_VALUES_PER_LEAF_ITEM (1 + SIMPLE8B_MAX_VALUES_PER_CODEWORD) + +struct intset_leaf_node +{ + /* common header, must match intset_node */ + uint16 level; /* 0 on leafs */ + uint16 num_items; + + intset_leaf_node *next; /* right sibling, if any */ + + leaf_item items[MAX_LEAF_ITEMS]; +}; + +/* + * We buffer insertions in a simple array, before packing and inserting them + * into the B-tree. MAX_BUFFERED_VALUES sets the size of the buffer. The + * encoder assumes that it is large enough that we can always fill a leaf + * item with buffered new items. In other words, MAX_BUFFERED_VALUES must be + * larger than MAX_VALUES_PER_LEAF_ITEM. For efficiency, make it much larger. + */ +#define MAX_BUFFERED_VALUES (MAX_VALUES_PER_LEAF_ITEM * 2) + +/* + * IntegerSet is the top-level object representing the set. + * + * The integers are stored in an in-memory B-tree structure, plus an array + * for newly-added integers. IntegerSet also tracks information about memory + * usage, as well as the current position when iterating the set with + * intset_begin_iterate / intset_iterate_next. + */ +struct IntegerSet +{ + /* + * 'context' is the memory context holding this integer set and all its + * tree nodes. 
+ * + * 'mem_used' tracks the amount of memory used. We don't do anything with + * it in integerset.c itself, but the callers can ask for it with + * intset_memory_usage(). + */ + MemoryContext context; + uint64 mem_used; + + uint64 num_entries; /* total # of values in the set */ + uint64 highest_value; /* highest value stored in this set */ + + /* + * B-tree to hold the packed values. + * + * 'rightmost_nodes' hold pointers to the rightmost node on each level. + * rightmost_parent[0] is rightmost leaf, rightmost_parent[1] is its + * parent, and so forth, all the way up to the root. These are needed when + * adding new values. (Currently, we require that new values are added at + * the end.) + */ + int num_levels; /* height of the tree */ + intset_node *root; /* root node */ + intset_node *rightmost_nodes[MAX_TREE_LEVELS]; + intset_leaf_node *leftmost_leaf; /* leftmost leaf node */ + + /* + * Holding area for new items that haven't been inserted to the tree yet. + */ + uint64 buffered_values[MAX_BUFFERED_VALUES]; + int num_buffered_values; + + /* + * Iterator support. + * + * 'iter_values' is an array of integers ready to be returned to the + * caller; 'iter_num_values' is the length of that array, and + * 'iter_valueno' is the next index. 'iter_node' and 'iter_itemno' point + * to the leaf node, and item within the leaf node, to get the next batch + * of values from. + * + * Normally, 'iter_values' points to 'iter_values_buf', which holds items + * decoded from a leaf item. But after we have scanned the whole B-tree, + * we iterate through all the unbuffered values, too, by pointing + * iter_values to 'buffered_values'. + */ + bool iter_active; /* is iteration in progress? */ + + const uint64 *iter_values; + int iter_num_values; /* number of elements in 'iter_values' */ + int iter_valueno; /* next index into 'iter_values' */ + + intset_leaf_node *iter_node; /* current leaf node */ + int iter_itemno; /* next item in 'iter_node' to decode */ + + uint64 iter_values_buf[MAX_VALUES_PER_LEAF_ITEM]; +}; + +/* + * Prototypes for internal functions. + */ +static void intset_update_upper(IntegerSet *intset, int level, + intset_node *child, uint64 child_key); +static void intset_flush_buffered_values(IntegerSet *intset); + +static int intset_binsrch_uint64(uint64 value, uint64 *arr, int arr_elems, + bool nextkey); +static int intset_binsrch_leaf(uint64 value, leaf_item *arr, int arr_elems, + bool nextkey); + +static uint64 simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base); +static int simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base); +static bool simple8b_contains(uint64 codeword, uint64 key, uint64 base); + + +/* + * Create a new, initially empty, integer set. + * + * The integer set is created in the current memory context. + * We will do all subsequent allocations in the same context, too, regardless + * of which memory context is current when new integers are added to the set. 
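+ *
+ * A minimal usage sketch (illustrative only; the variable names are not
+ * part of the interface):
+ *
+ *		IntegerSet *intset = intset_create();
+ *		uint64		val;
+ *
+ *		intset_add_member(intset, 1);
+ *		intset_add_member(intset, 5);	/* members must be added in order */
+ *
+ *		if (intset_is_member(intset, 5))
+ *			...
+ *
+ *		intset_begin_iterate(intset);
+ *		while (intset_iterate_next(intset, &val))
+ *			...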
+ */ +IntegerSet * +intset_create(void) +{ + IntegerSet *intset; + + intset = (IntegerSet *) palloc(sizeof(IntegerSet)); + intset->context = CurrentMemoryContext; + intset->mem_used = GetMemoryChunkSpace(intset); + + intset->num_entries = 0; + intset->highest_value = 0; + + intset->num_levels = 0; + intset->root = NULL; + memset(intset->rightmost_nodes, 0, sizeof(intset->rightmost_nodes)); + intset->leftmost_leaf = NULL; + + intset->num_buffered_values = 0; + + intset->iter_active = false; + intset->iter_node = NULL; + intset->iter_itemno = 0; + intset->iter_valueno = 0; + intset->iter_num_values = 0; + intset->iter_values = NULL; + + return intset; +} + +/* + * Allocate a new node. + */ +static intset_internal_node * +intset_new_internal_node(IntegerSet *intset) +{ + intset_internal_node *n; + + n = (intset_internal_node *) MemoryContextAlloc(intset->context, + sizeof(intset_internal_node)); + intset->mem_used += GetMemoryChunkSpace(n); + + n->level = 0; /* caller must set */ + n->num_items = 0; + + return n; +} + +static intset_leaf_node * +intset_new_leaf_node(IntegerSet *intset) +{ + intset_leaf_node *n; + + n = (intset_leaf_node *) MemoryContextAlloc(intset->context, + sizeof(intset_leaf_node)); + intset->mem_used += GetMemoryChunkSpace(n); + + n->level = 0; + n->num_items = 0; + n->next = NULL; + + return n; +} + +/* + * Return the number of entries in the integer set. + */ +uint64 +intset_num_entries(IntegerSet *intset) +{ + return intset->num_entries; +} + +/* + * Return the amount of memory used by the integer set. + */ +uint64 +intset_memory_usage(IntegerSet *intset) +{ + return intset->mem_used; +} + +/* + * Add a value to the set. + * + * Values must be added in order. + */ +void +intset_add_member(IntegerSet *intset, uint64 x) +{ + if (intset->iter_active) + elog(ERROR, "cannot add new values to integer set while iteration is in progress"); + + if (x <= intset->highest_value && intset->num_entries > 0) + elog(ERROR, "cannot add value to integer set out of order"); + + if (intset->num_buffered_values >= MAX_BUFFERED_VALUES) + { + /* Time to flush our buffer */ + intset_flush_buffered_values(intset); + Assert(intset->num_buffered_values < MAX_BUFFERED_VALUES); + } + + /* Add it to the buffer of newly-added values */ + intset->buffered_values[intset->num_buffered_values] = x; + intset->num_buffered_values++; + intset->num_entries++; + intset->highest_value = x; +} + +/* + * Take a batch of buffered values, and pack them into the B-tree. + */ +static void +intset_flush_buffered_values(IntegerSet *intset) +{ + uint64 *values = intset->buffered_values; + uint64 num_values = intset->num_buffered_values; + int num_packed = 0; + intset_leaf_node *leaf; + + leaf = (intset_leaf_node *) intset->rightmost_nodes[0]; + + /* + * If the tree is completely empty, create the first leaf page, which is + * also the root. + */ + if (leaf == NULL) + { + /* + * This is the very first item in the set. + * + * Allocate root node. It's also a leaf. + */ + leaf = intset_new_leaf_node(intset); + + intset->root = (intset_node *) leaf; + intset->leftmost_leaf = leaf; + intset->rightmost_nodes[0] = (intset_node *) leaf; + intset->num_levels = 1; + } + + /* + * If there are less than MAX_VALUES_PER_LEAF_ITEM values in the buffer, + * stop. In most cases, we cannot encode that many values in a single + * value, but this way, the encoder doesn't have to worry about running + * out of input. 
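+	 *
+	 * Concretely, with the current constants the loop below only packs leaf
+	 * items while at least 241 (MAX_VALUES_PER_LEAF_ITEM) unpacked values
+	 * remain; any shorter tail stays in buffered_values[] for now.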
+ */ + while (num_values - num_packed >= MAX_VALUES_PER_LEAF_ITEM) + { + leaf_item item; + int num_encoded; + + /* + * Construct the next leaf item, packing as many buffered values as + * possible. + */ + item.first = values[num_packed]; + item.codeword = simple8b_encode(&values[num_packed + 1], + &num_encoded, + item.first); + + /* + * Add the item to the node, allocating a new node if the old one is + * full. + */ + if (leaf->num_items >= MAX_LEAF_ITEMS) + { + /* Allocate new leaf and link it to the tree */ + intset_leaf_node *old_leaf = leaf; + + leaf = intset_new_leaf_node(intset); + old_leaf->next = leaf; + intset->rightmost_nodes[0] = (intset_node *) leaf; + intset_update_upper(intset, 1, (intset_node *) leaf, item.first); + } + leaf->items[leaf->num_items++] = item; + + num_packed += 1 + num_encoded; + } + + /* + * Move any remaining buffered values to the beginning of the array. + */ + if (num_packed < intset->num_buffered_values) + { + memmove(&intset->buffered_values[0], + &intset->buffered_values[num_packed], + (intset->num_buffered_values - num_packed) * sizeof(uint64)); + } + intset->num_buffered_values -= num_packed; +} + +/* + * Insert a downlink into parent node, after creating a new node. + * + * Recurses if the parent node is full, too. + */ +static void +intset_update_upper(IntegerSet *intset, int level, intset_node *child, + uint64 child_key) +{ + intset_internal_node *parent; + + Assert(level > 0); + + /* + * Create a new root node, if necessary. + */ + if (level >= intset->num_levels) + { + intset_node *oldroot = intset->root; + uint64 downlink_key; + + /* MAX_TREE_LEVELS should be more than enough, this shouldn't happen */ + if (intset->num_levels == MAX_TREE_LEVELS) + elog(ERROR, "could not expand integer set, maximum number of levels reached"); + intset->num_levels++; + + /* + * Get the first value on the old root page, to be used as the + * downlink. + */ + if (intset->root->level == 0) + downlink_key = ((intset_leaf_node *) oldroot)->items[0].first; + else + downlink_key = ((intset_internal_node *) oldroot)->values[0]; + + parent = intset_new_internal_node(intset); + parent->level = level; + parent->values[0] = downlink_key; + parent->downlinks[0] = oldroot; + parent->num_items = 1; + + intset->root = (intset_node *) parent; + intset->rightmost_nodes[level] = (intset_node *) parent; + } + + /* + * Place the downlink on the parent page. + */ + parent = (intset_internal_node *) intset->rightmost_nodes[level]; + + if (parent->num_items < MAX_INTERNAL_ITEMS) + { + parent->values[parent->num_items] = child_key; + parent->downlinks[parent->num_items] = child; + parent->num_items++; + } + else + { + /* + * Doesn't fit. Allocate new parent, with the downlink as the first + * item on it, and recursively insert the downlink to the new parent + * to the grandparent. + */ + parent = intset_new_internal_node(intset); + parent->level = level; + parent->values[0] = child_key; + parent->downlinks[0] = child; + parent->num_items = 1; + + intset->rightmost_nodes[level] = (intset_node *) parent; + + intset_update_upper(intset, level + 1, (intset_node *) parent, child_key); + } +} + +/* + * Does the set contain the given value? + */ +bool +intset_is_member(IntegerSet *intset, uint64 x) +{ + intset_node *node; + intset_leaf_node *leaf; + int level; + int itemno; + leaf_item *item; + + /* + * The value might be in the buffer of newly-added values. 
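+	 *
+	 * Because values are always added in ascending order, the buffer holds
+	 * the largest values in the set.  So if x >= buffered_values[0], it can
+	 * only be found here (or not at all), and we need not search the tree.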
+ */ + if (intset->num_buffered_values > 0 && x >= intset->buffered_values[0]) + { + int itemno; + + itemno = intset_binsrch_uint64(x, + intset->buffered_values, + intset->num_buffered_values, + false); + if (itemno >= intset->num_buffered_values) + return false; + else + return (intset->buffered_values[itemno] == x); + } + + /* + * Start from the root, and walk down the B-tree to find the right leaf + * node. + */ + if (!intset->root) + return false; + node = intset->root; + for (level = intset->num_levels - 1; level > 0; level--) + { + intset_internal_node *n = (intset_internal_node *) node; + + Assert(node->level == level); + + itemno = intset_binsrch_uint64(x, n->values, n->num_items, true); + if (itemno == 0) + return false; + node = n->downlinks[itemno - 1]; + } + Assert(node->level == 0); + leaf = (intset_leaf_node *) node; + + /* + * Binary search to find the right item on the leaf page + */ + itemno = intset_binsrch_leaf(x, leaf->items, leaf->num_items, true); + if (itemno == 0) + return false; + item = &leaf->items[itemno - 1]; + + /* Is this a match to the first value on the item? */ + if (item->first == x) + return true; + Assert(x > item->first); + + /* Is it in the packed codeword? */ + if (simple8b_contains(item->codeword, x, item->first)) + return true; + + return false; +} + +/* + * Begin in-order scan through all the values. + * + * While the iteration is in-progress, you cannot add new values to the set. + */ +void +intset_begin_iterate(IntegerSet *intset) +{ + /* Note that we allow an iteration to be abandoned midway */ + intset->iter_active = true; + intset->iter_node = intset->leftmost_leaf; + intset->iter_itemno = 0; + intset->iter_valueno = 0; + intset->iter_num_values = 0; + intset->iter_values = intset->iter_values_buf; +} + +/* + * Returns the next integer, when iterating. + * + * intset_begin_iterate() must be called first. intset_iterate_next() returns + * the next value in the set. Returns true, if there was another value, and + * stores the value in *next. Otherwise, returns false. + */ +bool +intset_iterate_next(IntegerSet *intset, uint64 *next) +{ + Assert(intset->iter_active); + for (;;) + { + /* Return next iter_values[] entry if any */ + if (intset->iter_valueno < intset->iter_num_values) + { + *next = intset->iter_values[intset->iter_valueno++]; + return true; + } + + /* Decode next item in current leaf node, if any */ + if (intset->iter_node && + intset->iter_itemno < intset->iter_node->num_items) + { + leaf_item *item; + int num_decoded; + + item = &intset->iter_node->items[intset->iter_itemno++]; + + intset->iter_values_buf[0] = item->first; + num_decoded = simple8b_decode(item->codeword, + &intset->iter_values_buf[1], + item->first); + intset->iter_num_values = num_decoded + 1; + intset->iter_valueno = 0; + continue; + } + + /* No more items on this leaf, step to next node */ + if (intset->iter_node) + { + intset->iter_node = intset->iter_node->next; + intset->iter_itemno = 0; + continue; + } + + /* + * We have reached the end of the B-tree. But we might still have + * some integers in the buffer of newly-added values. + */ + if (intset->iter_values == (const uint64 *) intset->iter_values_buf) + { + intset->iter_values = intset->buffered_values; + intset->iter_num_values = intset->num_buffered_values; + intset->iter_valueno = 0; + continue; + } + + break; + } + + /* No more results. 
*/ + intset->iter_active = false; + *next = 0; /* prevent uninitialized-variable warnings */ + return false; +} + +/* + * intset_binsrch_uint64() -- search a sorted array of uint64s + * + * Returns the first position with key equal or less than the given key. + * The returned position would be the "insert" location for the given key, + * that is, the position where the new key should be inserted to. + * + * 'nextkey' affects the behavior on equal keys. If true, and there is an + * equal key in the array, this returns the position immediately after the + * equal key. If false, this returns the position of the equal key itself. + */ +static int +intset_binsrch_uint64(uint64 item, uint64 *arr, int arr_elems, bool nextkey) +{ + int low, + high, + mid; + + low = 0; + high = arr_elems; + while (high > low) + { + mid = low + (high - low) / 2; + + if (nextkey) + { + if (item >= arr[mid]) + low = mid + 1; + else + high = mid; + } + else + { + if (item > arr[mid]) + low = mid + 1; + else + high = mid; + } + } + + return low; +} + +/* same, but for an array of leaf items */ +static int +intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey) +{ + int low, + high, + mid; + + low = 0; + high = arr_elems; + while (high > low) + { + mid = low + (high - low) / 2; + + if (nextkey) + { + if (item >= arr[mid].first) + low = mid + 1; + else + high = mid; + } + else + { + if (item > arr[mid].first) + low = mid + 1; + else + high = mid; + } + } + + return low; +} + +/* + * Simple-8b encoding. + * + * The simple-8b algorithm packs between 1 and 240 integers into 64-bit words, + * called "codewords". The number of integers packed into a single codeword + * depends on the integers being packed; small integers are encoded using + * fewer bits than large integers. A single codeword can store a single + * 60-bit integer, or two 30-bit integers, for example. + * + * Since we're storing a unique, sorted, set of integers, we actually encode + * the *differences* between consecutive integers. That way, clusters of + * integers that are close to each other are packed efficiently, regardless + * of their absolute values. + * + * In Simple-8b, each codeword consists of a 4-bit selector, which indicates + * how many integers are encoded in the codeword, and the encoded integers are + * packed into the remaining 60 bits. The selector allows for 16 different + * ways of using the remaining 60 bits, called "modes". The number of integers + * packed into a single codeword in each mode is listed in the simple8b_modes + * table below. For example, consider the following codeword: + * + * 20-bit integer 20-bit integer 20-bit integer + * 1101 00000000000000010010 01111010000100100000 00000000000000010100 + * ^ + * selector + * + * The selector 1101 is 13 in decimal. From the modes table below, we see + * that it means that the codeword encodes three 20-bit integers. In decimal, + * those integers are 18, 500000 and 20. Because we encode deltas rather than + * absolute values, the actual values that they represent are 18, 500018 and + * 500038. + * + * Modes 0 and 1 are a bit special; they encode a run of 240 or 120 zeroes + * (which means 240 or 120 consecutive integers, since we're encoding the + * deltas between integers), without using the rest of the codeword bits + * for anything. + * + * Simple-8b cannot encode integers larger than 60 bits. Values larger than + * that are always stored in the 'first' field of a leaf item, never in the + * packed codeword. 
If there is a sequence of integers that are more than + * 2^60 apart, the codeword will go unused on those items. To represent that, + * we use a magic EMPTY_CODEWORD codeword value. + */ +static const struct simple8b_mode +{ + uint8 bits_per_int; + uint8 num_ints; +} simple8b_modes[17] = + +{ + {0, 240}, /* mode 0: 240 zeroes */ + {0, 120}, /* mode 1: 120 zeroes */ + {1, 60}, /* mode 2: sixty 1-bit integers */ + {2, 30}, /* mode 3: thirty 2-bit integers */ + {3, 20}, /* mode 4: twenty 3-bit integers */ + {4, 15}, /* mode 5: fifteen 4-bit integers */ + {5, 12}, /* mode 6: twelve 5-bit integers */ + {6, 10}, /* mode 7: ten 6-bit integers */ + {7, 8}, /* mode 8: eight 7-bit integers (four bits + * are wasted) */ + {8, 7}, /* mode 9: seven 8-bit integers (four bits + * are wasted) */ + {10, 6}, /* mode 10: six 10-bit integers */ + {12, 5}, /* mode 11: five 12-bit integers */ + {15, 4}, /* mode 12: four 15-bit integers */ + {20, 3}, /* mode 13: three 20-bit integers */ + {30, 2}, /* mode 14: two 30-bit integers */ + {60, 1}, /* mode 15: one 60-bit integer */ + + {0, 0} /* sentinel value */ +}; + +/* + * EMPTY_CODEWORD is a special value, used to indicate "no values". + * It is used if the next value is too large to be encoded with Simple-8b. + * + * This value looks like a mode-0 codeword, but we can distinguish it + * because a regular mode-0 codeword would have zeroes in the unused bits. + */ +#define EMPTY_CODEWORD UINT64CONST(0x0FFFFFFFFFFFFFFF) + +/* + * Encode a number of integers into a Simple-8b codeword. + * + * (What we actually encode are deltas between successive integers. + * "base" is the value before ints[0].) + * + * The input array must contain at least SIMPLE8B_MAX_VALUES_PER_CODEWORD + * elements, ensuring that we can produce a full codeword. + * + * Returns the encoded codeword, and sets *num_encoded to the number of + * input integers that were encoded. That can be zero, if the first delta + * is too large to be encoded. + */ +static uint64 +simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base) +{ + int selector; + int nints; + int bits; + uint64 diff; + uint64 last_val; + uint64 codeword; + int i; + + Assert(ints[0] > base); + + /* + * Select the "mode" to use for this codeword. + * + * In each iteration, check if the next value can be represented in the + * current mode we're considering. If it's too large, then step up the + * mode to a wider one, and repeat. If it fits, move on to the next + * integer. Repeat until the codeword is full, given the current mode. + * + * Note that we don't have any way to represent unused slots in the + * codeword, so we require each codeword to be "full". It is always + * possible to produce a full codeword unless the very first delta is too + * large to be encoded. For example, if the first delta is small but the + * second is too large to be encoded, we'll end up using the last "mode", + * which has nints == 1. 
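+	 *
+	 * An illustrative walk-through: if every pending value is exactly one
+	 * greater than the previous one, all "diff"s are zero and mode 0 packs
+	 * 240 values into the codeword.  If the largest "diff" encountered is 5,
+	 * the loop steps up through modes 2 and 3 and settles on mode 4 (three
+	 * bits per integer, twenty integers per codeword), provided twenty such
+	 * deltas are available.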
+ */ + selector = 0; + nints = simple8b_modes[0].num_ints; + bits = simple8b_modes[0].bits_per_int; + diff = ints[0] - base - 1; + last_val = ints[0]; + i = 0; /* number of deltas we have accepted */ + for (;;) + { + if (diff >= (UINT64CONST(1) << bits)) + { + /* too large, step up to next mode */ + selector++; + nints = simple8b_modes[selector].num_ints; + bits = simple8b_modes[selector].bits_per_int; + /* we might already have accepted enough deltas for this mode */ + if (i >= nints) + break; + } + else + { + /* accept this delta; then done if codeword is full */ + i++; + if (i >= nints) + break; + /* examine next delta */ + Assert(ints[i] > last_val); + diff = ints[i] - last_val - 1; + last_val = ints[i]; + } + } + + if (nints == 0) + { + /* + * The first delta is too large to be encoded with Simple-8b. + * + * If there is at least one not-too-large integer in the input, we + * will encode it using mode 15 (or a more compact mode). Hence, we + * can only get here if the *first* delta is >= 2^60. + */ + Assert(i == 0); + *num_encoded = 0; + return EMPTY_CODEWORD; + } + + /* + * Encode the integers using the selected mode. Note that we shift them + * into the codeword in reverse order, so that they will come out in the + * correct order in the decoder. + */ + codeword = 0; + if (bits > 0) + { + for (i = nints - 1; i > 0; i--) + { + diff = ints[i] - ints[i - 1] - 1; + codeword |= diff; + codeword <<= bits; + } + diff = ints[0] - base - 1; + codeword |= diff; + } + + /* add selector to the codeword, and return */ + codeword |= (uint64) selector << 60; + + *num_encoded = nints; + return codeword; +} + +/* + * Decode a codeword into an array of integers. + * Returns the number of integers decoded. + */ +static int +simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base) +{ + int selector = (codeword >> 60); + int nints = simple8b_modes[selector].num_ints; + int bits = simple8b_modes[selector].bits_per_int; + uint64 mask = (UINT64CONST(1) << bits) - 1; + uint64 curr_value; + + if (codeword == EMPTY_CODEWORD) + return 0; + + curr_value = base; + for (int i = 0; i < nints; i++) + { + uint64 diff = codeword & mask; + + curr_value += 1 + diff; + decoded[i] = curr_value; + codeword >>= bits; + } + return nints; +} + +/* + * This is very similar to simple8b_decode(), but instead of decoding all + * the values to an array, it just checks if the given "key" is part of + * the codeword. + */ +static bool +simple8b_contains(uint64 codeword, uint64 key, uint64 base) +{ + int selector = (codeword >> 60); + int nints = simple8b_modes[selector].num_ints; + int bits = simple8b_modes[selector].bits_per_int; + + if (codeword == EMPTY_CODEWORD) + return false; + + if (bits == 0) + { + /* Special handling for 0-bit cases. 
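+		 * Modes 0 and 1 represent a run of nints consecutive values,
+		 * base+1 .. base+nints, so membership reduces to a range check
+		 * (key is greater than base whenever we get here, because the
+		 * caller compares against the item's 'first' value separately).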
*/ + return (key - base) <= nints; + } + else + { + uint64 mask = (UINT64CONST(1) << bits) - 1; + uint64 curr_value; + + curr_value = base; + for (int i = 0; i < nints; i++) + { + uint64 diff = codeword & mask; + + curr_value += 1 + diff; + + if (curr_value >= key) + { + if (curr_value == key) + return true; + else + return false; + } + + codeword >>= bits; + } + } + return false; +} diff --git a/src/backend/lib/knapsack.c b/src/backend/lib/knapsack.c new file mode 100644 index 0000000..50c84b4 --- /dev/null +++ b/src/backend/lib/knapsack.c @@ -0,0 +1,111 @@ +/*------------------------------------------------------------------------- + * + * knapsack.c + * Knapsack problem solver + * + * Given input vectors of integral item weights (must be >= 0) and values + * (double >= 0), compute the set of items which produces the greatest total + * value without exceeding a specified total weight; each item is included at + * most once (this is the 0/1 knapsack problem). Weight 0 items will always be + * included. + * + * The performance of this algorithm is pseudo-polynomial, O(nW) where W is the + * weight limit. To use with non-integral weights or approximate solutions, + * the caller should pre-scale the input weights to a suitable range. This + * allows approximate solutions in polynomial time (the general case of the + * exact problem is NP-hard). + * + * Copyright (c) 2017-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/lib/knapsack.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> +#include <limits.h> + +#include "lib/knapsack.h" +#include "miscadmin.h" +#include "nodes/bitmapset.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + +/* + * DiscreteKnapsack + * + * The item_values input is optional; if omitted, all the items are assumed to + * have value 1. + * + * Returns a Bitmapset of the 0..(n-1) indexes of the items chosen for + * inclusion in the solution. + * + * This uses the usual dynamic-programming algorithm, adapted to reuse the + * memory on each pass (by working from larger weights to smaller). At the + * start of pass number i, the values[w] array contains the largest value + * computed with total weight <= w, using only items with indices < i; and + * sets[w] contains the bitmap of items actually used for that value. (The + * bitmapsets are all pre-initialized with an unused high bit so that memory + * allocation is done only once.) + */ +Bitmapset * +DiscreteKnapsack(int max_weight, int num_items, + int *item_weights, double *item_values) +{ + MemoryContext local_ctx = AllocSetContextCreate(CurrentMemoryContext, + "Knapsack", + ALLOCSET_SMALL_SIZES); + MemoryContext oldctx = MemoryContextSwitchTo(local_ctx); + double *values; + Bitmapset **sets; + Bitmapset *result; + int i, + j; + + Assert(max_weight >= 0); + Assert(num_items > 0 && item_weights); + + values = palloc((1 + max_weight) * sizeof(double)); + sets = palloc((1 + max_weight) * sizeof(Bitmapset *)); + + for (i = 0; i <= max_weight; ++i) + { + values[i] = 0; + sets[i] = bms_make_singleton(num_items); + } + + for (i = 0; i < num_items; ++i) + { + int iw = item_weights[i]; + double iv = item_values ? 
item_values[i] : 1; + + for (j = max_weight; j >= iw; --j) + { + int ow = j - iw; + + if (values[j] <= values[ow] + iv) + { + /* copy sets[ow] to sets[j] without realloc */ + if (j != ow) + { + sets[j] = bms_del_members(sets[j], sets[j]); + sets[j] = bms_add_members(sets[j], sets[ow]); + } + + sets[j] = bms_add_member(sets[j], i); + + values[j] = values[ow] + iv; + } + } + } + + MemoryContextSwitchTo(oldctx); + + result = bms_del_member(bms_copy(sets[max_weight]), num_items); + + MemoryContextDelete(local_ctx); + + return result; +} diff --git a/src/backend/lib/pairingheap.c b/src/backend/lib/pairingheap.c new file mode 100644 index 0000000..bed3d2e --- /dev/null +++ b/src/backend/lib/pairingheap.c @@ -0,0 +1,333 @@ +/*------------------------------------------------------------------------- + * + * pairingheap.c + * A Pairing Heap implementation + * + * A pairing heap is a data structure that's useful for implementing + * priority queues. It is simple to implement, and provides amortized O(1) + * insert and find-min operations, and amortized O(log n) delete-min. + * + * The pairing heap was first described in this paper: + * + * Michael L. Fredman, Robert Sedgewick, Daniel D. Sleator, and Robert E. + * Tarjan. 1986. + * The pairing heap: a new form of self-adjusting heap. + * Algorithmica 1, 1 (January 1986), pages 111-129. DOI: 10.1007/BF01840439 + * + * Portions Copyright (c) 2012-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/lib/pairingheap.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "lib/pairingheap.h" + +static pairingheap_node *merge(pairingheap *heap, pairingheap_node *a, + pairingheap_node *b); +static pairingheap_node *merge_children(pairingheap *heap, + pairingheap_node *children); + +/* + * pairingheap_allocate + * + * Returns a pointer to a newly-allocated heap, with the heap property defined + * by the given comparator function, which will be invoked with the additional + * argument specified by 'arg'. + */ +pairingheap * +pairingheap_allocate(pairingheap_comparator compare, void *arg) +{ + pairingheap *heap; + + heap = (pairingheap *) palloc(sizeof(pairingheap)); + heap->ph_compare = compare; + heap->ph_arg = arg; + + heap->ph_root = NULL; + + return heap; +} + +/* + * pairingheap_free + * + * Releases memory used by the given pairingheap. + * + * Note: The nodes in the heap are not freed! + */ +void +pairingheap_free(pairingheap *heap) +{ + pfree(heap); +} + +/* + * A helper function to merge two subheaps into one. + * + * The subheap with smaller value is put as a child of the other one (assuming + * a max-heap). + * + * The next_sibling and prev_or_parent pointers of the input nodes are + * ignored. On return, the returned node's next_sibling and prev_or_parent + * pointers are garbage. + */ +static pairingheap_node * +merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b) +{ + if (a == NULL) + return b; + if (b == NULL) + return a; + + /* swap 'a' and 'b' so that 'a' is the one with larger value */ + if (heap->ph_compare(a, b, heap->ph_arg) < 0) + { + pairingheap_node *tmp; + + tmp = a; + a = b; + b = tmp; + } + + /* and put 'b' as a child of 'a' */ + if (a->first_child) + a->first_child->prev_or_parent = b; + b->prev_or_parent = a; + b->next_sibling = a->first_child; + a->first_child = b; + + return a; +} + +/* + * pairingheap_add + * + * Adds the given node to the heap in O(1) time. 
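+ *
+ * A minimal usage sketch (illustrative; 'MyItem' and 'item' are placeholders,
+ * and 'heap' is a pairingheap previously obtained from pairingheap_allocate().
+ * The caller embeds the pairingheap_node in its own struct and allocates it):
+ *
+ *		typedef struct MyItem
+ *		{
+ *			pairingheap_node ph_node;
+ *			int			value;
+ *		} MyItem;
+ *
+ *		MyItem	   *item = palloc(sizeof(MyItem));
+ *
+ *		item->value = 42;
+ *		pairingheap_add(heap, &item->ph_node);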
+ */ +void +pairingheap_add(pairingheap *heap, pairingheap_node *node) +{ + node->first_child = NULL; + + /* Link the new node as a new tree */ + heap->ph_root = merge(heap, heap->ph_root, node); + heap->ph_root->prev_or_parent = NULL; + heap->ph_root->next_sibling = NULL; +} + +/* + * pairingheap_first + * + * Returns a pointer to the first (root, topmost) node in the heap without + * modifying the heap. The caller must ensure that this routine is not used on + * an empty heap. Always O(1). + */ +pairingheap_node * +pairingheap_first(pairingheap *heap) +{ + Assert(!pairingheap_is_empty(heap)); + + return heap->ph_root; +} + +/* + * pairingheap_remove_first + * + * Removes the first (root, topmost) node in the heap and returns a pointer to + * it after rebalancing the heap. The caller must ensure that this routine is + * not used on an empty heap. O(log n) amortized. + */ +pairingheap_node * +pairingheap_remove_first(pairingheap *heap) +{ + pairingheap_node *result; + pairingheap_node *children; + + Assert(!pairingheap_is_empty(heap)); + + /* Remove the root, and form a new heap of its children. */ + result = heap->ph_root; + children = result->first_child; + + heap->ph_root = merge_children(heap, children); + if (heap->ph_root) + { + heap->ph_root->prev_or_parent = NULL; + heap->ph_root->next_sibling = NULL; + } + + return result; +} + +/* + * Remove 'node' from the heap. O(log n) amortized. + */ +void +pairingheap_remove(pairingheap *heap, pairingheap_node *node) +{ + pairingheap_node *children; + pairingheap_node *replacement; + pairingheap_node *next_sibling; + pairingheap_node **prev_ptr; + + /* + * If the removed node happens to be the root node, do it with + * pairingheap_remove_first(). + */ + if (node == heap->ph_root) + { + (void) pairingheap_remove_first(heap); + return; + } + + /* + * Before we modify anything, remember the removed node's first_child and + * next_sibling pointers. + */ + children = node->first_child; + next_sibling = node->next_sibling; + + /* + * Also find the pointer to the removed node in its previous sibling, or + * if this is the first child of its parent, in its parent. + */ + if (node->prev_or_parent->first_child == node) + prev_ptr = &node->prev_or_parent->first_child; + else + prev_ptr = &node->prev_or_parent->next_sibling; + Assert(*prev_ptr == node); + + /* + * If this node has children, make a new subheap of the children and link + * the subheap in place of the removed node. Otherwise just unlink this + * node. + */ + if (children) + { + replacement = merge_children(heap, children); + + replacement->prev_or_parent = node->prev_or_parent; + replacement->next_sibling = node->next_sibling; + *prev_ptr = replacement; + if (next_sibling) + next_sibling->prev_or_parent = replacement; + } + else + { + *prev_ptr = next_sibling; + if (next_sibling) + next_sibling->prev_or_parent = node->prev_or_parent; + } +} + +/* + * Merge a list of subheaps into a single heap. + * + * This implements the basic two-pass merging strategy, first forming pairs + * from left to right, and then merging the pairs. 
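+ *
+ * For example, given the sibling list A B C D E, the first pass produces
+ * merge(A,B), merge(C,D) and E; the second pass then folds them together
+ * starting from the end of the list: E is merged with merge(C,D), and that
+ * result with merge(A,B).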
+ */ +static pairingheap_node * +merge_children(pairingheap *heap, pairingheap_node *children) +{ + pairingheap_node *curr, + *next; + pairingheap_node *pairs; + pairingheap_node *newroot; + + if (children == NULL || children->next_sibling == NULL) + return children; + + /* Walk the subheaps from left to right, merging in pairs */ + next = children; + pairs = NULL; + for (;;) + { + curr = next; + + if (curr == NULL) + break; + + if (curr->next_sibling == NULL) + { + /* last odd node at the end of list */ + curr->next_sibling = pairs; + pairs = curr; + break; + } + + next = curr->next_sibling->next_sibling; + + /* merge this and the next subheap, and add to 'pairs' list. */ + + curr = merge(heap, curr, curr->next_sibling); + curr->next_sibling = pairs; + pairs = curr; + } + + /* + * Merge all the pairs together to form a single heap. + */ + newroot = pairs; + next = pairs->next_sibling; + while (next) + { + curr = next; + next = curr->next_sibling; + + newroot = merge(heap, newroot, curr); + } + + return newroot; +} + +/* + * A debug function to dump the contents of the heap as a string. + * + * The 'dumpfunc' callback appends a string representation of a single node + * to the StringInfo. 'opaque' can be used to pass more information to the + * callback. + */ +#ifdef PAIRINGHEAP_DEBUG +static void +pairingheap_dump_recurse(StringInfo buf, + pairingheap_node *node, + void (*dumpfunc) (pairingheap_node *node, StringInfo buf, void *opaque), + void *opaque, + int depth, + pairingheap_node *prev_or_parent) +{ + while (node) + { + Assert(node->prev_or_parent == prev_or_parent); + + appendStringInfoSpaces(buf, depth * 4); + dumpfunc(node, buf, opaque); + appendStringInfoChar(buf, '\n'); + if (node->first_child) + pairingheap_dump_recurse(buf, node->first_child, dumpfunc, opaque, depth + 1, node); + prev_or_parent = node; + node = node->next_sibling; + } +} + +char * +pairingheap_dump(pairingheap *heap, + void (*dumpfunc) (pairingheap_node *node, StringInfo buf, void *opaque), + void *opaque) +{ + StringInfoData buf; + + if (!heap->ph_root) + return pstrdup("(empty)"); + + initStringInfo(&buf); + + pairingheap_dump_recurse(&buf, heap->ph_root, dumpfunc, opaque, 0, NULL); + + return buf.data; +} +#endif diff --git a/src/backend/lib/rbtree.c b/src/backend/lib/rbtree.c new file mode 100644 index 0000000..536df1f --- /dev/null +++ b/src/backend/lib/rbtree.c @@ -0,0 +1,770 @@ +/*------------------------------------------------------------------------- + * + * rbtree.c + * implementation for PostgreSQL generic Red-Black binary tree package + * Adopted from http://algolist.manual.ru/ds/rbtree.php + * + * This code comes from Thomas Niemann's "Sorting and Searching Algorithms: + * a Cookbook". + * + * See http://www.cs.auckland.ac.nz/software/AlgAnim/niemann/s_man.htm for + * license terms: "Source code, when part of a software project, may be used + * freely without reference to the author." + * + * Red-black trees are a type of balanced binary tree wherein (1) any child of + * a red node is always black, and (2) every path from root to leaf traverses + * an equal number of black nodes. From these properties, it follows that the + * longest path from root to leaf is only about twice as long as the shortest, + * so lookups are guaranteed to run in O(lg n) time. 
+ * + * Copyright (c) 2009-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/lib/rbtree.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "lib/rbtree.h" + + +/* + * Colors of nodes (values of RBTNode.color) + */ +#define RBTBLACK (0) +#define RBTRED (1) + +/* + * RBTree control structure + */ +struct RBTree +{ + RBTNode *root; /* root node, or RBTNIL if tree is empty */ + + /* Remaining fields are constant after rbt_create */ + + Size node_size; /* actual size of tree nodes */ + /* The caller-supplied manipulation functions */ + rbt_comparator comparator; + rbt_combiner combiner; + rbt_allocfunc allocfunc; + rbt_freefunc freefunc; + /* Passthrough arg passed to all manipulation functions */ + void *arg; +}; + +/* + * all leafs are sentinels, use customized NIL name to prevent + * collision with system-wide constant NIL which is actually NULL + */ +#define RBTNIL (&sentinel) + +static RBTNode sentinel = +{ + RBTBLACK, RBTNIL, RBTNIL, NULL +}; + + +/* + * rbt_create: create an empty RBTree + * + * Arguments are: + * node_size: actual size of tree nodes (> sizeof(RBTNode)) + * The manipulation functions: + * comparator: compare two RBTNodes for less/equal/greater + * combiner: merge an existing tree entry with a new one + * allocfunc: allocate a new RBTNode + * freefunc: free an old RBTNode + * arg: passthrough pointer that will be passed to the manipulation functions + * + * Note that the combiner's righthand argument will be a "proposed" tree node, + * ie the input to rbt_insert, in which the RBTNode fields themselves aren't + * valid. Similarly, either input to the comparator may be a "proposed" node. + * This shouldn't matter since the functions aren't supposed to look at the + * RBTNode fields, only the extra fields of the struct the RBTNode is embedded + * in. + * + * The freefunc should just be pfree or equivalent; it should NOT attempt + * to free any subsidiary data, because the node passed to it may not contain + * valid data! freefunc can be NULL if caller doesn't require retail + * space reclamation. + * + * The RBTree node is palloc'd in the caller's memory context. Note that + * all contents of the tree are actually allocated by the caller, not here. + * + * Since tree contents are managed by the caller, there is currently not + * an explicit "destroy" operation; typically a tree would be freed by + * resetting or deleting the memory context it's stored in. You can pfree + * the RBTree node if you feel the urge. + */ +RBTree * +rbt_create(Size node_size, + rbt_comparator comparator, + rbt_combiner combiner, + rbt_allocfunc allocfunc, + rbt_freefunc freefunc, + void *arg) +{ + RBTree *tree = (RBTree *) palloc(sizeof(RBTree)); + + Assert(node_size > sizeof(RBTNode)); + + tree->root = RBTNIL; + tree->node_size = node_size; + tree->comparator = comparator; + tree->combiner = combiner; + tree->allocfunc = allocfunc; + tree->freefunc = freefunc; + + tree->arg = arg; + + return tree; +} + +/* Copy the additional data fields from one RBTNode to another */ +static inline void +rbt_copy_data(RBTree *rbt, RBTNode *dest, const RBTNode *src) +{ + memcpy(dest + 1, src + 1, rbt->node_size - sizeof(RBTNode)); +} + +/********************************************************************** + * Search * + **********************************************************************/ + +/* + * rbt_find: search for a value in an RBTree + * + * data represents the value to try to find. 
Its RBTNode fields need not + * be valid, it's the extra data in the larger struct that is of interest. + * + * Returns the matching tree entry, or NULL if no match is found. + */ +RBTNode * +rbt_find(RBTree *rbt, const RBTNode *data) +{ + RBTNode *node = rbt->root; + + while (node != RBTNIL) + { + int cmp = rbt->comparator(data, node, rbt->arg); + + if (cmp == 0) + return node; + else if (cmp < 0) + node = node->left; + else + node = node->right; + } + + return NULL; +} + +/* + * rbt_leftmost: fetch the leftmost (smallest-valued) tree node. + * Returns NULL if tree is empty. + * + * Note: in the original implementation this included an unlink step, but + * that's a bit awkward. Just call rbt_delete on the result if that's what + * you want. + */ +RBTNode * +rbt_leftmost(RBTree *rbt) +{ + RBTNode *node = rbt->root; + RBTNode *leftmost = rbt->root; + + while (node != RBTNIL) + { + leftmost = node; + node = node->left; + } + + if (leftmost != RBTNIL) + return leftmost; + + return NULL; +} + +/********************************************************************** + * Insertion * + **********************************************************************/ + +/* + * Rotate node x to left. + * + * x's right child takes its place in the tree, and x becomes the left + * child of that node. + */ +static void +rbt_rotate_left(RBTree *rbt, RBTNode *x) +{ + RBTNode *y = x->right; + + /* establish x->right link */ + x->right = y->left; + if (y->left != RBTNIL) + y->left->parent = x; + + /* establish y->parent link */ + if (y != RBTNIL) + y->parent = x->parent; + if (x->parent) + { + if (x == x->parent->left) + x->parent->left = y; + else + x->parent->right = y; + } + else + { + rbt->root = y; + } + + /* link x and y */ + y->left = x; + if (x != RBTNIL) + x->parent = y; +} + +/* + * Rotate node x to right. + * + * x's left right child takes its place in the tree, and x becomes the right + * child of that node. + */ +static void +rbt_rotate_right(RBTree *rbt, RBTNode *x) +{ + RBTNode *y = x->left; + + /* establish x->left link */ + x->left = y->right; + if (y->right != RBTNIL) + y->right->parent = x; + + /* establish y->parent link */ + if (y != RBTNIL) + y->parent = x->parent; + if (x->parent) + { + if (x == x->parent->right) + x->parent->right = y; + else + x->parent->left = y; + } + else + { + rbt->root = y; + } + + /* link x and y */ + y->right = x; + if (x != RBTNIL) + x->parent = y; +} + +/* + * Maintain Red-Black tree balance after inserting node x. + * + * The newly inserted node is always initially marked red. That may lead to + * a situation where a red node has a red child, which is prohibited. We can + * always fix the problem by a series of color changes and/or "rotations", + * which move the problem progressively higher up in the tree. If one of the + * two red nodes is the root, we can always fix the problem by changing the + * root from red to black. + * + * (This does not work lower down in the tree because we must also maintain + * the invariant that every leaf has equal black-height.) + */ +static void +rbt_insert_fixup(RBTree *rbt, RBTNode *x) +{ + /* + * x is always a red node. Initially, it is the newly inserted node. Each + * iteration of this loop moves it higher up in the tree. + */ + while (x != rbt->root && x->parent->color == RBTRED) + { + /* + * x and x->parent are both red. Fix depends on whether x->parent is + * a left or right child. In either case, we define y to be the + * "uncle" of x, that is, the other child of x's grandparent. 
+ * + * If the uncle is red, we flip the grandparent to red and its two + * children to black. Then we loop around again to check whether the + * grandparent still has a problem. + * + * If the uncle is black, we will perform one or two "rotations" to + * balance the tree. Either x or x->parent will take the + * grandparent's position in the tree and recolored black, and the + * original grandparent will be recolored red and become a child of + * that node. This always leaves us with a valid red-black tree, so + * the loop will terminate. + */ + if (x->parent == x->parent->parent->left) + { + RBTNode *y = x->parent->parent->right; + + if (y->color == RBTRED) + { + /* uncle is RBTRED */ + x->parent->color = RBTBLACK; + y->color = RBTBLACK; + x->parent->parent->color = RBTRED; + + x = x->parent->parent; + } + else + { + /* uncle is RBTBLACK */ + if (x == x->parent->right) + { + /* make x a left child */ + x = x->parent; + rbt_rotate_left(rbt, x); + } + + /* recolor and rotate */ + x->parent->color = RBTBLACK; + x->parent->parent->color = RBTRED; + + rbt_rotate_right(rbt, x->parent->parent); + } + } + else + { + /* mirror image of above code */ + RBTNode *y = x->parent->parent->left; + + if (y->color == RBTRED) + { + /* uncle is RBTRED */ + x->parent->color = RBTBLACK; + y->color = RBTBLACK; + x->parent->parent->color = RBTRED; + + x = x->parent->parent; + } + else + { + /* uncle is RBTBLACK */ + if (x == x->parent->left) + { + x = x->parent; + rbt_rotate_right(rbt, x); + } + x->parent->color = RBTBLACK; + x->parent->parent->color = RBTRED; + + rbt_rotate_left(rbt, x->parent->parent); + } + } + } + + /* + * The root may already have been black; if not, the black-height of every + * node in the tree increases by one. + */ + rbt->root->color = RBTBLACK; +} + +/* + * rbt_insert: insert a new value into the tree. + * + * data represents the value to insert. Its RBTNode fields need not + * be valid, it's the extra data in the larger struct that is of interest. + * + * If the value represented by "data" is not present in the tree, then + * we copy "data" into a new tree entry and return that node, setting *isNew + * to true. + * + * If the value represented by "data" is already present, then we call the + * combiner function to merge data into the existing node, and return the + * existing node, setting *isNew to false. + * + * "data" is unmodified in either case; it's typically just a local + * variable in the caller. + */ +RBTNode * +rbt_insert(RBTree *rbt, const RBTNode *data, bool *isNew) +{ + RBTNode *current, + *parent, + *x; + int cmp; + + /* find where node belongs */ + current = rbt->root; + parent = NULL; + cmp = 0; /* just to prevent compiler warning */ + + while (current != RBTNIL) + { + cmp = rbt->comparator(data, current, rbt->arg); + if (cmp == 0) + { + /* + * Found node with given key. Apply combiner. + */ + rbt->combiner(current, data, rbt->arg); + *isNew = false; + return current; + } + parent = current; + current = (cmp < 0) ? current->left : current->right; + } + + /* + * Value is not present, so create a new node containing data. 
+ */ + *isNew = true; + + x = rbt->allocfunc(rbt->arg); + + x->color = RBTRED; + + x->left = RBTNIL; + x->right = RBTNIL; + x->parent = parent; + rbt_copy_data(rbt, x, data); + + /* insert node in tree */ + if (parent) + { + if (cmp < 0) + parent->left = x; + else + parent->right = x; + } + else + { + rbt->root = x; + } + + rbt_insert_fixup(rbt, x); + + return x; +} + +/********************************************************************** + * Deletion * + **********************************************************************/ + +/* + * Maintain Red-Black tree balance after deleting a black node. + */ +static void +rbt_delete_fixup(RBTree *rbt, RBTNode *x) +{ + /* + * x is always a black node. Initially, it is the former child of the + * deleted node. Each iteration of this loop moves it higher up in the + * tree. + */ + while (x != rbt->root && x->color == RBTBLACK) + { + /* + * Left and right cases are symmetric. Any nodes that are children of + * x have a black-height one less than the remainder of the nodes in + * the tree. We rotate and recolor nodes to move the problem up the + * tree: at some stage we'll either fix the problem, or reach the root + * (where the black-height is allowed to decrease). + */ + if (x == x->parent->left) + { + RBTNode *w = x->parent->right; + + if (w->color == RBTRED) + { + w->color = RBTBLACK; + x->parent->color = RBTRED; + + rbt_rotate_left(rbt, x->parent); + w = x->parent->right; + } + + if (w->left->color == RBTBLACK && w->right->color == RBTBLACK) + { + w->color = RBTRED; + + x = x->parent; + } + else + { + if (w->right->color == RBTBLACK) + { + w->left->color = RBTBLACK; + w->color = RBTRED; + + rbt_rotate_right(rbt, w); + w = x->parent->right; + } + w->color = x->parent->color; + x->parent->color = RBTBLACK; + w->right->color = RBTBLACK; + + rbt_rotate_left(rbt, x->parent); + x = rbt->root; /* Arrange for loop to terminate. */ + } + } + else + { + RBTNode *w = x->parent->left; + + if (w->color == RBTRED) + { + w->color = RBTBLACK; + x->parent->color = RBTRED; + + rbt_rotate_right(rbt, x->parent); + w = x->parent->left; + } + + if (w->right->color == RBTBLACK && w->left->color == RBTBLACK) + { + w->color = RBTRED; + + x = x->parent; + } + else + { + if (w->left->color == RBTBLACK) + { + w->right->color = RBTBLACK; + w->color = RBTRED; + + rbt_rotate_left(rbt, w); + w = x->parent->left; + } + w->color = x->parent->color; + x->parent->color = RBTBLACK; + w->left->color = RBTBLACK; + + rbt_rotate_right(rbt, x->parent); + x = rbt->root; /* Arrange for loop to terminate. */ + } + } + } + x->color = RBTBLACK; +} + +/* + * Delete node z from tree. + */ +static void +rbt_delete_node(RBTree *rbt, RBTNode *z) +{ + RBTNode *x, + *y; + + /* This is just paranoia: we should only get called on a valid node */ + if (!z || z == RBTNIL) + return; + + /* + * y is the node that will actually be removed from the tree. This will + * be z if z has fewer than two children, or the tree successor of z + * otherwise. + */ + if (z->left == RBTNIL || z->right == RBTNIL) + { + /* y has a RBTNIL node as a child */ + y = z; + } + else + { + /* find tree successor */ + y = z->right; + while (y->left != RBTNIL) + y = y->left; + } + + /* x is y's only child */ + if (y->left != RBTNIL) + x = y->left; + else + x = y->right; + + /* Remove y from the tree. 
*/ + x->parent = y->parent; + if (y->parent) + { + if (y == y->parent->left) + y->parent->left = x; + else + y->parent->right = x; + } + else + { + rbt->root = x; + } + + /* + * If we removed the tree successor of z rather than z itself, then move + * the data for the removed node to the one we were supposed to remove. + */ + if (y != z) + rbt_copy_data(rbt, z, y); + + /* + * Removing a black node might make some paths from root to leaf contain + * fewer black nodes than others, or it might make two red nodes adjacent. + */ + if (y->color == RBTBLACK) + rbt_delete_fixup(rbt, x); + + /* Now we can recycle the y node */ + if (rbt->freefunc) + rbt->freefunc(y, rbt->arg); +} + +/* + * rbt_delete: remove the given tree entry + * + * "node" must have previously been found via rbt_find or rbt_leftmost. + * It is caller's responsibility to free any subsidiary data attached + * to the node before calling rbt_delete. (Do *not* try to push that + * responsibility off to the freefunc, as some other physical node + * may be the one actually freed!) + */ +void +rbt_delete(RBTree *rbt, RBTNode *node) +{ + rbt_delete_node(rbt, node); +} + +/********************************************************************** + * Traverse * + **********************************************************************/ + +static RBTNode * +rbt_left_right_iterator(RBTreeIterator *iter) +{ + if (iter->last_visited == NULL) + { + iter->last_visited = iter->rbt->root; + while (iter->last_visited->left != RBTNIL) + iter->last_visited = iter->last_visited->left; + + return iter->last_visited; + } + + if (iter->last_visited->right != RBTNIL) + { + iter->last_visited = iter->last_visited->right; + while (iter->last_visited->left != RBTNIL) + iter->last_visited = iter->last_visited->left; + + return iter->last_visited; + } + + for (;;) + { + RBTNode *came_from = iter->last_visited; + + iter->last_visited = iter->last_visited->parent; + if (iter->last_visited == NULL) + { + iter->is_over = true; + break; + } + + if (iter->last_visited->left == came_from) + break; /* came from left sub-tree, return current + * node */ + + /* else - came from right sub-tree, continue to move up */ + } + + return iter->last_visited; +} + +static RBTNode * +rbt_right_left_iterator(RBTreeIterator *iter) +{ + if (iter->last_visited == NULL) + { + iter->last_visited = iter->rbt->root; + while (iter->last_visited->right != RBTNIL) + iter->last_visited = iter->last_visited->right; + + return iter->last_visited; + } + + if (iter->last_visited->left != RBTNIL) + { + iter->last_visited = iter->last_visited->left; + while (iter->last_visited->right != RBTNIL) + iter->last_visited = iter->last_visited->right; + + return iter->last_visited; + } + + for (;;) + { + RBTNode *came_from = iter->last_visited; + + iter->last_visited = iter->last_visited->parent; + if (iter->last_visited == NULL) + { + iter->is_over = true; + break; + } + + if (iter->last_visited->right == came_from) + break; /* came from right sub-tree, return current + * node */ + + /* else - came from left sub-tree, continue to move up */ + } + + return iter->last_visited; +} + +/* + * rbt_begin_iterate: prepare to traverse the tree in any of several orders + * + * After calling rbt_begin_iterate, call rbt_iterate repeatedly until it + * returns NULL or the traversal stops being of interest. + * + * If the tree is changed during traversal, results of further calls to + * rbt_iterate are unspecified. Multiple concurrent iterators on the same + * tree are allowed. 
+ *
+ * The iterator state is stored in the 'iter' struct. The caller should
+ * treat it as an opaque struct.
+ */
+void
+rbt_begin_iterate(RBTree *rbt, RBTOrderControl ctrl, RBTreeIterator *iter)
+{
+	/* Common initialization for all traversal orders */
+	iter->rbt = rbt;
+	iter->last_visited = NULL;
+	iter->is_over = (rbt->root == RBTNIL);
+
+	switch (ctrl)
+	{
+		case LeftRightWalk:		/* visit left, then self, then right */
+			iter->iterate = rbt_left_right_iterator;
+			break;
+		case RightLeftWalk:		/* visit right, then self, then left */
+			iter->iterate = rbt_right_left_iterator;
+			break;
+		default:
+			elog(ERROR, "unrecognized rbtree iteration order: %d", ctrl);
+	}
+}
+
+/*
+ * rbt_iterate: return the next node in traversal order, or NULL if no more
+ */
+RBTNode *
+rbt_iterate(RBTreeIterator *iter)
+{
+	if (iter->is_over)
+		return NULL;
+
+	return iter->iterate(iter);
+}
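As an illustrative sketch (not part of the patch), the code below shows how a
caller is expected to use this interface: embed RBTNode as the first field of a
larger struct, supply comparator/combiner/allocator/free callbacks, and then
drive rbt_insert(), rbt_begin_iterate()/rbt_iterate() and rbt_delete(). It
assumes the declarations in lib/rbtree.h, including rbt_create() and
rbt_find(), which are not visible in the hunks above; the IntRBTreeNode
layout, the callback bodies and irbt_demo() itself are hypothetical.

#include "postgres.h"

#include "lib/rbtree.h"

/* Hypothetical node type: RBTNode must be the first field. */
typedef struct IntRBTreeNode
{
	RBTNode		rbtnode;		/* embedded tree node */
	int			key;			/* search key */
	int			count;			/* payload merged by the combiner */
} IntRBTreeNode;

/* Comparator: defines the tree's sort order. */
static int
irbt_cmp(const RBTNode *a, const RBTNode *b, void *arg)
{
	const IntRBTreeNode *ea = (const IntRBTreeNode *) a;
	const IntRBTreeNode *eb = (const IntRBTreeNode *) b;

	return (ea->key > eb->key) - (ea->key < eb->key);
}

/* Combiner: rbt_insert calls this instead of adding a duplicate key. */
static void
irbt_combine(RBTNode *existing, const RBTNode *newdata, void *arg)
{
	((IntRBTreeNode *) existing)->count +=
		((const IntRBTreeNode *) newdata)->count;
}

/* Allocator and deallocator for physical tree nodes. */
static RBTNode *
irbt_alloc(void *arg)
{
	return (RBTNode *) palloc(sizeof(IntRBTreeNode));
}

static void
irbt_free(RBTNode *node, void *arg)
{
	pfree(node);
}

void
irbt_demo(void)
{
	RBTree	   *tree;
	RBTreeIterator iter;
	IntRBTreeNode probe;
	RBTNode    *node;
	bool		isNew;
	int			i;

	tree = rbt_create(sizeof(IntRBTreeNode),
					  irbt_cmp, irbt_combine,
					  irbt_alloc, irbt_free,
					  NULL);

	/* Insert keys 0..9 twice; the second pass exercises the combiner. */
	for (i = 0; i < 20; i++)
	{
		probe.key = i % 10;
		probe.count = 1;
		(void) rbt_insert(tree, &probe.rbtnode, &isNew);
	}

	/* In-order walk: keys come out ascending, each with count == 2. */
	rbt_begin_iterate(tree, LeftRightWalk, &iter);
	while ((node = rbt_iterate(&iter)) != NULL)
		elog(DEBUG1, "key %d, count %d",
			 ((IntRBTreeNode *) node)->key,
			 ((IntRBTreeNode *) node)->count);

	/* Look up one entry and remove it. */
	probe.key = 3;
	node = rbt_find(tree, &probe.rbtnode);
	if (node != NULL)
		rbt_delete(tree, node);
}

Note the caveat documented at rbt_delete() above: because deletion may
physically recycle the tree successor of the node passed in (its data is copied
over the doomed entry), any subsidiary data attached to a node must be freed by
the caller before calling rbt_delete(), not from the freefunc.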