summaryrefslogtreecommitdiffstats
path: root/src/backend/utils/hash/dynahash.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 13:44:03 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 13:44:03 +0000
commit293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch)
treefc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/utils/hash/dynahash.c
parentInitial commit. (diff)
downloadpostgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz
postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip
Adding upstream version 16.2.upstream/16.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/hash/dynahash.c')
-rw-r--r--src/backend/utils/hash/dynahash.c1925
1 files changed, 1925 insertions, 0 deletions
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
new file mode 100644
index 0000000..012d4a0
--- /dev/null
+++ b/src/backend/utils/hash/dynahash.c
@@ -0,0 +1,1925 @@
+/*-------------------------------------------------------------------------
+ *
+ * dynahash.c
+ * dynamic chained hash tables
+ *
+ * dynahash.c supports both local-to-a-backend hash tables and hash tables in
+ * shared memory. For shared hash tables, it is the caller's responsibility
+ * to provide appropriate access interlocking. The simplest convention is
+ * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
+ * hash_seq_search) need only shared lock, but any update requires exclusive
+ * lock. For heavily-used shared tables, the single-lock approach creates a
+ * concurrency bottleneck, so we also support "partitioned" locking wherein
+ * there are multiple LWLocks guarding distinct subsets of the table. To use
+ * a hash table in partitioned mode, the HASH_PARTITION flag must be given
+ * to hash_create. This prevents any attempt to split buckets on-the-fly.
+ * Therefore, each hash bucket chain operates independently, and no fields
+ * of the hash header change after init except nentries and freeList.
+ * (A partitioned table uses multiple copies of those fields, guarded by
+ * spinlocks, for additional concurrency.)
+ * This lets any subset of the hash buckets be treated as a separately
+ * lockable partition. We expect callers to use the low-order bits of a
+ * lookup key's hash value as a partition number --- this will work because
+ * of the way calc_bucket() maps hash values to bucket numbers.
+ *
+ * For hash tables in shared memory, the memory allocator function should
+ * match malloc's semantics of returning NULL on failure. For hash tables
+ * in local memory, we typically use palloc() which will throw error on
+ * failure. The code in this file has to cope with both cases.
+ *
+ * dynahash.c provides support for these types of lookup keys:
+ *
+ * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
+ * compared as though by strcmp(). This is selected by specifying the
+ * HASH_STRINGS flag to hash_create.
+ *
+ * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
+ * (Caller must ensure there are no undefined padding bits in the keys!)
+ * This is selected by specifying the HASH_BLOBS flag to hash_create.
+ *
+ * 3. More complex key behavior can be selected by specifying user-supplied
+ * hashing, comparison, and/or key-copying functions. At least a hashing
+ * function must be supplied; comparison defaults to memcmp() and key copying
+ * to memcpy() when a user-defined hashing function is selected.
+ *
+ * Compared to simplehash, dynahash has the following benefits:
+ *
+ * - It supports partitioning, which is useful for shared memory access using
+ * locks.
+ * - Shared memory hashes are allocated in a fixed size area at startup and
+ * are discoverable by name from other processes.
+ * - Because entries don't need to be moved in the case of hash conflicts,
+ * dynahash has better performance for large entries.
+ * - Guarantees stable pointers to entries.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/hash/dynahash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Original comments:
+ *
+ * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
+ * Coded into C, with minor code improvements, and with hsearch(3) interface,
+ * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
+ * also, hcreate/hdestroy routines added to simulate hsearch(3).
+ *
+ * These routines simulate hsearch(3) and family, with the important
+ * difference that the hash table is dynamic - can grow indefinitely
+ * beyond its original size (as supplied to hcreate()).
+ *
+ * Performance appears to be comparable to that of hsearch(3).
+ * The 'source-code' options referred to in hsearch(3)'s 'man' page
+ * are not implemented; otherwise functionality is identical.
+ *
+ * Compilation controls:
+ * HASH_DEBUG controls some informative traces, mainly for debugging.
+ * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
+ * when combined with HASH_DEBUG, these are displayed by hdestroy().
+ *
+ * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
+ * concatenation property, in probably unnecessary code 'optimization'.
+ *
+ * Modified margo@postgres.berkeley.edu February 1990
+ * added multiple table interface
+ * Modified by sullivan@postgres.berkeley.edu April 1990
+ * changed ctl structure for shared memory
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/xact.h"
+#include "common/hashfn.h"
+#include "port/pg_bitutils.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/dynahash.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Constants
+ *
+ * A hash table has a top-level "directory", each of whose entries points
+ * to a "segment" of ssize bucket headers. The maximum number of hash
+ * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
+ * the number of records in the table can be larger, but we don't want a
+ * whole lot of records per bucket or performance goes down.
+ *
+ * In a hash table allocated in shared memory, the directory cannot be
+ * expanded because it must stay at a fixed address. The directory size
+ * should be selected using hash_select_dirsize (and you'd better have
+ * a good idea of the maximum number of entries!). For non-shared hash
+ * tables, the initial directory size can be left at the default.
+ */
+/* Defaults installed by hdefault(); hash_create() overrides them on request */
+#define DEF_SEGSIZE 256 /* default ssize (bucket headers per segment) */
+#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
+#define DEF_DIRSIZE 256 /* default dsize (directory entries) */
+
+/* Number of freelists to be used for a partitioned hash table. */
+#define NUM_FREELISTS 32
+
+/* A hash bucket is a linked list of HASHELEMENTs */
+typedef HASHELEMENT *HASHBUCKET;
+
+/* A hash segment is an array of bucket headers */
+typedef HASHBUCKET *HASHSEGMENT;
+
+/*
+ * Per-freelist data.
+ *
+ * In a partitioned hash table, each freelist is associated with a specific
+ * set of hashcodes, as determined by the FREELIST_IDX() macro below.
+ * nentries tracks the number of live hashtable entries having those hashcodes
+ * (NOT the number of entries in the freelist, as you might expect).
+ *
+ * The coverage of a freelist might be more or less than one partition, so it
+ * needs its own lock rather than relying on caller locking. Relying on that
+ * wouldn't work even if the coverage was the same, because of the occasional
+ * need to "borrow" entries from another freelist; see get_hash_entry().
+ *
+ * Using an array of FreeListData instead of separate arrays of mutexes,
+ * nentries and freeLists helps to reduce sharing of cache lines between
+ * different mutexes.
+ */
+typedef struct
+{
+ slock_t mutex; /* spinlock for this freelist; only initialized
+ * (by init_htab) when the table is partitioned */
+ long nentries; /* number of live entries in associated buckets,
+ * not the length of the freeList chain */
+ HASHELEMENT *freeList; /* chain of free elements */
+} FreeListData;
+
+/*
+ * Header structure for a hash table --- contains all changeable info
+ *
+ * In a shared-memory hash table, the HASHHDR is in shared memory, while
+ * each backend has a local HTAB struct. For a non-shared table, there isn't
+ * any functional difference between HASHHDR and HTAB, but we separate them
+ * anyway to share code between shared and non-shared tables.
+ */
+struct HASHHDR
+{
+ /*
+ * The freelist can become a point of contention in high-concurrency hash
+ * tables, so we use an array of freelists, each with its own mutex and
+ * nentries count, instead of just a single one. Although the freelists
+ * normally operate independently, we will scavenge entries from freelists
+ * other than a hashcode's default freelist when necessary.
+ *
+ * If the hash table is not partitioned, only freeList[0] is used and its
+ * spinlock is not used at all; callers' locking is assumed sufficient.
+ */
+ FreeListData freeList[NUM_FREELISTS];
+
+ /* These fields can change, but not in a partitioned table */
+ /* Also, dsize can't change in a shared table, even if unpartitioned */
+ long dsize; /* directory size (DEF_DIRSIZE unless overridden) */
+ long nsegs; /* number of allocated segments (<= dsize) */
+ uint32 max_bucket; /* ID of maximum bucket in use */
+ uint32 high_mask; /* mask to modulo into entire table */
+ uint32 low_mask; /* mask to modulo into lower half of table */
+
+ /* These fields are fixed at hashtable creation */
+ Size keysize; /* hash key length in bytes */
+ Size entrysize; /* total user element size in bytes */
+ long num_partitions; /* # partitions (must be power of 2), or 0 */
+ long max_dsize; /* 'dsize' limit if directory is fixed size;
+ * NO_MAX_DSIZE by default */
+ long ssize; /* segment size --- must be power of 2 */
+ int sshift; /* segment shift = log2(ssize) */
+ int nelem_alloc; /* number of entries to allocate at once; set
+ * by init_htab via choose_nelem_alloc() */
+
+#ifdef HASH_STATISTICS
+
+ /*
+ * Count statistics here. NB: stats code doesn't bother with mutex, so
+ * counts could be corrupted a bit in a partitioned table.
+ */
+ long accesses;
+ long collisions;
+#endif
+};
+
+/* True if num_partitions is nonzero, i.e. hash_create saw HASH_PARTITION */
+#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
+
+/*
+ * Index of the freelist serving a hashcode: hashcode modulo NUM_FREELISTS
+ * in a partitioned table, otherwise always 0.
+ */
+#define FREELIST_IDX(hctl, hashcode) \
+ (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
+
+/*
+ * Top control structure for a hashtable --- in a shared table, each backend
+ * has its own copy (OK since no fields change at runtime)
+ */
+struct HTAB
+{
+ HASHHDR *hctl; /* => shared control information */
+ HASHSEGMENT *dir; /* directory of segment starts */
+ HashValueFunc hash; /* hash function */
+ HashCompareFunc match; /* key comparison function */
+ HashCopyFunc keycopy; /* key copying function */
+ HashAllocFunc alloc; /* memory allocator */
+ MemoryContext hcxt; /* memory context if default allocator used;
+ * NULL for a shared-memory table */
+ char *tabname; /* table name (for error messages); stored
+ * right after this struct by hash_create */
+ bool isshared; /* true if table is in shared memory */
+ bool isfixed; /* if true, don't enlarge (HASH_FIXED_SIZE) */
+
+ /* freezing a shared table isn't allowed, so we can keep state here */
+ bool frozen; /* true = no more inserts allowed */
+
+ /* We keep local copies of these fixed values to reduce contention */
+ Size keysize; /* hash key length in bytes */
+ long ssize; /* segment size --- must be power of 2 */
+ int sshift; /* segment shift = log2(ssize) */
+};
+
+/*
+ * Key (also entry) part of a HASHELEMENT
+ */
+/* The key starts MAXALIGN(sizeof(HASHELEMENT)) bytes past the element header */
+#define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
+
+/*
+ * Obtain element pointer given pointer to key (the inverse of ELEMENTKEY:
+ * steps back over the same MAXALIGN'd header size)
+ */
+#define ELEMENT_FROM_KEY(key) \
+ ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
+
+/*
+ * Fast MOD arithmetic, assuming that y is a power of 2 !
+ * (Implemented as a bit mask with y-1, so the result is wrong for any
+ * other y.)
+ */
+#define MOD(x,y) ((x) & ((y)-1))
+
+#ifdef HASH_STATISTICS
+/*
+ * File-level totals over all hash tables (per-table counts live in HASHHDR);
+ * printed by hash_stats().
+ */
+static long hash_accesses,
+ hash_collisions,
+ hash_expansions;
+#endif
+
+/*
+ * Private function prototypes
+ */
+static void *DynaHashAlloc(Size size);
+static HASHSEGMENT seg_alloc(HTAB *hashp);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
+static bool dir_realloc(HTAB *hashp);
+static bool expand_table(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
+static void hdefault(HTAB *hashp);
+static int choose_nelem_alloc(Size entrysize);
+static bool init_htab(HTAB *hashp, long nelem);
+static void hash_corrupted(HTAB *hashp);
+static long next_pow2_long(long num);
+static int next_pow2_int(long num);
+static void register_seq_scan(HTAB *hashp);
+static void deregister_seq_scan(HTAB *hashp);
+static bool has_seq_scans(HTAB *hashp);
+
+
+/*
+ * memory allocation support
+ */
+/*
+ * Memory context that DynaHashAlloc() allocates from. DynaHashAlloc() is
+ * called with only a size, so the context is handed over through this
+ * variable: hash_create() and init_htab() set it before allocating.
+ */
+static MemoryContext CurrentDynaHashCxt = NULL;
+
+static void *
+DynaHashAlloc(Size size)
+{
+    MemoryContext cxt = CurrentDynaHashCxt;
+    void       *chunk;
+
+    /* The caller must have pointed CurrentDynaHashCxt at a live context */
+    Assert(MemoryContextIsValid(cxt));
+
+    /* NO_OOM: hand back NULL on failure instead of throwing an error */
+    chunk = MemoryContextAllocExtended(cxt, size, MCXT_ALLOC_NO_OOM);
+
+    return chunk;
+}
+
+
+/*
+ * HashCompareFunc for string keys
+ *
+ * Because we copy keys with strlcpy(), they will be truncated at keysize-1
+ * bytes, so we can only compare that many ... hence strncmp is almost but
+ * not quite the right thing.
+ */
+static int
+string_compare(const char *key1, const char *key2, Size keysize)
+{
+    /* Stored keys hold at most keysize-1 characters, so compare only those */
+    Size        maxcmp = keysize - 1;
+
+    return strncmp(key1, key2, maxcmp);
+}
+
+
+/************************** CREATE ROUTINES **********************/
+
+/*
+ * hash_create -- create a new dynamic hash table
+ *
+ * tabname: a name for the table (for debugging purposes)
+ * nelem: maximum number of elements expected
+ * *info: additional table parameters, as indicated by flags
+ * flags: bitmask indicating which parameters to take from *info
+ *
+ * The flags value *must* include HASH_ELEM. (Formerly, this was nominally
+ * optional, but the default keysize and entrysize values were useless.)
+ * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
+ * or HASH_FUNCTION, to define the key hashing semantics (C strings,
+ * binary blobs, or custom, respectively). Callers specifying a custom
+ * hash function will likely also want to use HASH_COMPARE, and perhaps
+ * also HASH_KEYCOPY, to control key comparison and copying.
+ * Another often-used flag is HASH_CONTEXT, to allocate the hash table
+ * under info->hcxt rather than under TopMemoryContext; the default
+ * behavior is only suitable for session-lifespan hash tables.
+ * Other flags bits are special-purpose and seldom used, except for those
+ * associated with shared-memory hash tables, for which see ShmemInitHash().
+ *
+ * Fields in *info are read only when the associated flags bit is set.
+ * It is not necessary to initialize other fields of *info.
+ * Neither tabname nor *info need persist after the hash_create() call.
+ *
+ * Note: It is deprecated for callers of hash_create() to explicitly specify
+ * string_hash, tag_hash, uint32_hash, or oid_hash. Just set HASH_STRINGS or
+ * HASH_BLOBS. Use HASH_FUNCTION only when you want something other than
+ * one of these.
+ *
+ * Note: for a shared-memory hashtable, nelem needs to be a pretty good
+ * estimate, since we can't expand the table on the fly. But an unshared
+ * hashtable can be expanded on-the-fly, so it's better for nelem to be
+ * on the small side and let the table grow if it's exceeded. An overly
+ * large nelem will penalize hash_seq_search speed without buying much.
+ */
+HTAB *
+hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
+{
+    HTAB       *hashp;
+    HASHHDR    *hctl;
+
+    /*
+     * Hash tables now allocate space for key and data, but you have to say
+     * how much space to allocate.
+     */
+    Assert(flags & HASH_ELEM);
+    Assert(info->keysize > 0);
+    Assert(info->entrysize >= info->keysize);
+
+    /*
+     * For shared hash tables, we have a local hash header (HTAB struct) that
+     * we allocate in TopMemoryContext; all else is in shared memory.
+     *
+     * For non-shared hash tables, everything including the hash header is in
+     * a memory context created specially for the hash table --- this makes
+     * hash_destroy very simple. The memory context is made a child of either
+     * a context specified by the caller, or TopMemoryContext if nothing is
+     * specified.
+     */
+    if (flags & HASH_SHARED_MEM)
+    {
+        /* Set up to allocate the hash header */
+        CurrentDynaHashCxt = TopMemoryContext;
+    }
+    else
+    {
+        /* Create the hash table's private memory context */
+        if (flags & HASH_CONTEXT)
+            CurrentDynaHashCxt = info->hcxt;
+        else
+            CurrentDynaHashCxt = TopMemoryContext;
+        CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
+                                                   "dynahash",
+                                                   ALLOCSET_DEFAULT_SIZES);
+    }
+
+    /* Initialize the hash header, plus a copy of the table name */
+    hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
+
+    /*
+     * DynaHashAlloc() allocates with MCXT_ALLOC_NO_OOM, so it reports failure
+     * by returning NULL rather than by throwing an error. Check the result
+     * before writing through it, using the same error as for the HASHHDR
+     * allocation below.
+     */
+    if (hashp == NULL)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+
+    MemSet(hashp, 0, sizeof(HTAB));
+
+    /* The name copy lives in the same chunk, right after the struct */
+    hashp->tabname = (char *) (hashp + 1);
+    strcpy(hashp->tabname, tabname);
+
+    /* If we have a private context, label it with hashtable's name */
+    if (!(flags & HASH_SHARED_MEM))
+        MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);
+
+    /*
+     * Select the appropriate hash function (see comments at head of file).
+     */
+    if (flags & HASH_FUNCTION)
+    {
+        Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
+        hashp->hash = info->hash;
+    }
+    else if (flags & HASH_BLOBS)
+    {
+        Assert(!(flags & HASH_STRINGS));
+        /* We can optimize hashing for common key sizes */
+        if (info->keysize == sizeof(uint32))
+            hashp->hash = uint32_hash;
+        else
+            hashp->hash = tag_hash;
+    }
+    else
+    {
+        /*
+         * string_hash used to be considered the default hash method, and in a
+         * non-assert build it effectively still is. But we now consider it
+         * an assertion error to not say HASH_STRINGS explicitly. To help
+         * catch mistaken usage of HASH_STRINGS, we also insist on a
+         * reasonably long string length: if the keysize is only 4 or 8 bytes,
+         * it's almost certainly an integer or pointer not a string.
+         */
+        Assert(flags & HASH_STRINGS);
+        Assert(info->keysize > 8);
+
+        hashp->hash = string_hash;
+    }
+
+    /*
+     * If you don't specify a match function, it defaults to string_compare if
+     * you used string_hash, and to memcmp otherwise.
+     *
+     * Note: explicitly specifying string_hash is deprecated, because this
+     * might not work for callers in loadable modules on some platforms due to
+     * referencing a trampoline instead of the string_hash function proper.
+     * Specify HASH_STRINGS instead.
+     */
+    if (flags & HASH_COMPARE)
+        hashp->match = info->match;
+    else if (hashp->hash == string_hash)
+        hashp->match = (HashCompareFunc) string_compare;
+    else
+        hashp->match = memcmp;
+
+    /*
+     * Similarly, the key-copying function defaults to strlcpy or memcpy.
+     */
+    if (flags & HASH_KEYCOPY)
+        hashp->keycopy = info->keycopy;
+    else if (hashp->hash == string_hash)
+    {
+        /*
+         * The signature of keycopy is meant for memcpy(), which returns
+         * void*, but strlcpy() returns size_t. Since we never use the return
+         * value of keycopy, and size_t is pretty much always the same size as
+         * void *, this should be safe. The extra cast in the middle is to
+         * avoid warnings from -Wcast-function-type.
+         */
+        hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
+    }
+    else
+        hashp->keycopy = memcpy;
+
+    /* And select the entry allocation function, too. */
+    if (flags & HASH_ALLOC)
+        hashp->alloc = info->alloc;
+    else
+        hashp->alloc = DynaHashAlloc;
+
+    if (flags & HASH_SHARED_MEM)
+    {
+        /*
+         * ctl structure and directory are preallocated for shared memory
+         * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
+         * well.
+         */
+        hashp->hctl = info->hctl;
+        hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
+        hashp->hcxt = NULL;
+        hashp->isshared = true;
+
+        /* hash table already exists, we're just attaching to it */
+        if (flags & HASH_ATTACH)
+        {
+            /* make local copies of some heavily-used values */
+            hctl = hashp->hctl;
+            hashp->keysize = hctl->keysize;
+            hashp->ssize = hctl->ssize;
+            hashp->sshift = hctl->sshift;
+
+            return hashp;
+        }
+    }
+    else
+    {
+        /* setup hash table defaults */
+        hashp->hctl = NULL;
+        hashp->dir = NULL;
+        hashp->hcxt = CurrentDynaHashCxt;
+        hashp->isshared = false;
+    }
+
+    if (!hashp->hctl)
+    {
+        hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
+        if (!hashp->hctl)
+            ereport(ERROR,
+                    (errcode(ERRCODE_OUT_OF_MEMORY),
+                     errmsg("out of memory")));
+    }
+
+    hashp->frozen = false;
+
+    /* Start from default header values, then apply the caller's overrides */
+    hdefault(hashp);
+
+    hctl = hashp->hctl;
+
+    if (flags & HASH_PARTITION)
+    {
+        /* Doesn't make sense to partition a local hash table */
+        Assert(flags & HASH_SHARED_MEM);
+
+        /*
+         * The number of partitions had better be a power of 2. Also, it must
+         * be less than INT_MAX (see init_htab()), so call the int version of
+         * next_pow2.
+         */
+        Assert(info->num_partitions == next_pow2_int(info->num_partitions));
+
+        hctl->num_partitions = info->num_partitions;
+    }
+
+    if (flags & HASH_SEGMENT)
+    {
+        hctl->ssize = info->ssize;
+        hctl->sshift = my_log2(info->ssize);
+        /* ssize had better be a power of 2 */
+        Assert(hctl->ssize == (1L << hctl->sshift));
+    }
+
+    /*
+     * SHM hash tables have fixed directory size passed by the caller.
+     */
+    if (flags & HASH_DIRSIZE)
+    {
+        hctl->max_dsize = info->max_dsize;
+        hctl->dsize = info->dsize;
+    }
+
+    /* remember the entry sizes, too */
+    hctl->keysize = info->keysize;
+    hctl->entrysize = info->entrysize;
+
+    /* make local copies of heavily-used constant fields */
+    hashp->keysize = hctl->keysize;
+    hashp->ssize = hctl->ssize;
+    hashp->sshift = hctl->sshift;
+
+    /* Build the hash directory structure */
+    if (!init_htab(hashp, nelem))
+        elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
+
+    /*
+     * For a shared hash table, preallocate the requested number of elements.
+     * This reduces problems with run-time out-of-shared-memory conditions.
+     *
+     * For a non-shared hash table, preallocate the requested number of
+     * elements if it's less than our chosen nelem_alloc. This avoids wasting
+     * space if the caller correctly estimates a small table size.
+     */
+    if ((flags & HASH_SHARED_MEM) ||
+        nelem < hctl->nelem_alloc)
+    {
+        int         i,
+                    freelist_partitions,
+                    nelem_alloc,
+                    nelem_alloc_first;
+
+        /*
+         * If hash table is partitioned, give each freelist an equal share of
+         * the initial allocation. Otherwise only freeList[0] is used.
+         */
+        if (IS_PARTITIONED(hashp->hctl))
+            freelist_partitions = NUM_FREELISTS;
+        else
+            freelist_partitions = 1;
+
+        nelem_alloc = nelem / freelist_partitions;
+        if (nelem_alloc <= 0)
+            nelem_alloc = 1;
+
+        /*
+         * Make sure we'll allocate all the requested elements; freeList[0]
+         * gets the excess if the request isn't divisible by NUM_FREELISTS.
+         */
+        if (nelem_alloc * freelist_partitions < nelem)
+            nelem_alloc_first =
+                nelem - nelem_alloc * (freelist_partitions - 1);
+        else
+            nelem_alloc_first = nelem_alloc;
+
+        for (i = 0; i < freelist_partitions; i++)
+        {
+            int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+            if (!element_alloc(hashp, temp, i))
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
+        }
+    }
+
+    if (flags & HASH_FIXED_SIZE)
+        hashp->isfixed = true;
+    return hashp;
+}
+
+/*
+ * Set default HASHHDR parameters.
+ */
+static void
+hdefault(HTAB *hashp)
+{
+    HASHHDR    *hdr = hashp->hctl;
+
+    /* Begin with an all-zeroes header, then fill in the nonzero defaults */
+    MemSet(hdr, 0, sizeof(HASHHDR));
+
+    /* segment geometry */
+    hdr->ssize = DEF_SEGSIZE;
+    hdr->sshift = DEF_SEGSIZE_SHIFT;
+
+    /* directory: default size, no fixed upper limit, no segments yet */
+    hdr->dsize = DEF_DIRSIZE;
+    hdr->max_dsize = NO_MAX_DSIZE;
+    hdr->nsegs = 0;
+
+    /* zero partitions means "not partitioned" */
+    hdr->num_partitions = 0;
+
+#ifdef HASH_STATISTICS
+    hdr->collisions = 0;
+    hdr->accesses = 0;
+#endif
+}
+
+/*
+ * Given the user-specified entry size, choose nelem_alloc, ie, how many
+ * elements to add to the hash table when we need more.
+ */
+static int
+choose_nelem_alloc(Size entrysize)
+{
+    /* Each element is a HASHELEMENT header followed by the user's entry. */
+    /* NB: this had better match element_alloc() */
+    Size        perElement = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
+    Size        request;
+    int         count;
+
+    /*
+     * We want at least 32 elements per allocation, with the request sized at
+     * (or just under) a power of 2. That matters little in shared memory,
+     * but palloc rounds requests up to a power of 2 anyway, so ignoring it
+     * could waste up to half of each allocation.
+     *
+     * Try 256 bytes first (assuming elements are at least 8 bytes) and keep
+     * doubling until 32 elements fit.
+     */
+    request = 32 * 4 * 2;
+    for (;;)
+    {
+        count = request / perElement;
+        if (count >= 32)
+            break;
+        request <<= 1;
+    }
+
+    return count;
+}
+
+/*
+ * Compute derived fields of hctl and build the initial directory/segment
+ * arrays
+ */
+static bool
+init_htab(HTAB *hashp, long nelem)
+{
+    HASHHDR    *hctl = hashp->hctl;
+    HASHSEGMENT *segp;
+    int         nbuckets;
+    int         nsegs;
+    int         i;
+
+    /*
+     * Returns false if a preallocated directory is too small or if any
+     * allocation fails; hash_create() reports that as an error.
+     */
+
+    /*
+     * initialize mutexes if it's a partitioned table
+     */
+    if (IS_PARTITIONED(hctl))
+        for (i = 0; i < NUM_FREELISTS; i++)
+            SpinLockInit(&(hctl->freeList[i].mutex));
+
+    /*
+     * Allocate space for the next greater power of two number of buckets,
+     * assuming a desired maximum load factor of 1.
+     */
+    nbuckets = next_pow2_int(nelem);
+
+    /*
+     * In a partitioned table, nbuckets must be at least equal to
+     * num_partitions; were it less, keys with apparently different partition
+     * numbers would map to the same bucket, breaking partition independence.
+     * (Normally nbuckets will be much bigger; this is just a safety check.)
+     */
+    while (nbuckets < hctl->num_partitions)
+        nbuckets <<= 1;
+
+    hctl->max_bucket = hctl->low_mask = nbuckets - 1;
+
+    /*
+     * Double in unsigned arithmetic: the mask fields are uint32, and a signed
+     * left shift of a large nbuckets could overflow int.
+     */
+    hctl->high_mask = ((uint32) nbuckets << 1) - 1;
+
+    /*
+     * Figure number of directory segments needed, round up to a power of 2
+     */
+    nsegs = (nbuckets - 1) / hctl->ssize + 1;
+    nsegs = next_pow2_int(nsegs);
+
+    /*
+     * Make sure directory is big enough. If pre-allocated directory is too
+     * small, choke (caller screwed up).
+     */
+    if (nsegs > hctl->dsize)
+    {
+        if (!(hashp->dir))
+            hctl->dsize = nsegs;
+        else
+            return false;
+    }
+
+    /* Allocate a directory */
+    if (!(hashp->dir))
+    {
+        CurrentDynaHashCxt = hashp->hcxt;
+        hashp->dir = (HASHSEGMENT *)
+            hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
+        if (!hashp->dir)
+            return false;
+    }
+
+    /* Allocate initial segments */
+    for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
+    {
+        *segp = seg_alloc(hashp);
+        if (*segp == NULL)
+            return false;
+    }
+
+    /* Choose number of entries to allocate at a time */
+    hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
+
+#ifdef HASH_DEBUG
+
+    /*
+     * One "%s%<conv>" pair per label/value pair below, with each conversion
+     * matching its value's type: pointer, long, long, int, uint32 (%u),
+     * uint32 (%x), uint32 (%x), long.
+     */
+    fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%u\n%s%x\n%s%x\n%s%ld\n",
+            "TABLE POINTER ", hashp,
+            "DIRECTORY SIZE ", hctl->dsize,
+            "SEGMENT SIZE ", hctl->ssize,
+            "SEGMENT SHIFT ", hctl->sshift,
+            "MAX BUCKET ", hctl->max_bucket,
+            "HIGH MASK ", hctl->high_mask,
+            "LOW MASK ", hctl->low_mask,
+            "NSEGS ", hctl->nsegs);
+#endif
+    return true;
+}
+
+/*
+ * Estimate the space needed for a hashtable containing the given number
+ * of entries of given size.
+ * NOTE: this is used to estimate the footprint of hashtables in shared
+ * memory; therefore it does not count HTAB which is in local memory.
+ * NB: assumes that all hash structure parameters have default values!
+ */
+Size
+hash_estimate_size(long num_entries, Size entrysize)
+{
+    /* buckets wanted, and the segments needed to hold them */
+    long        bucketCount = next_pow2_long(num_entries);
+    long        segCount = next_pow2_long((bucketCount - 1) / DEF_SEGSIZE + 1);
+    long        dirSlots;
+    long        perAlloc;
+    long        allocCalls;
+    long        perElement;
+    Size        total;
+
+    /* directory starts at the default size and is doubled until it fits */
+    for (dirSlots = DEF_DIRSIZE; dirSlots < segCount; dirSlots <<= 1)
+        ;
+
+    /* fixed control info (the HTAB is local memory, so not counted) */
+    total = MAXALIGN(sizeof(HASHHDR));
+
+    /* directory */
+    total = add_size(total, mul_size(dirSlots, sizeof(HASHSEGMENT)));
+
+    /* segments */
+    total = add_size(total,
+                     mul_size(segCount,
+                              MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
+
+    /* elements, which come in batches of choose_nelem_alloc() entries */
+    perAlloc = choose_nelem_alloc(entrysize);
+    allocCalls = (num_entries - 1) / perAlloc + 1;
+    perElement = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
+    total = add_size(total,
+                     mul_size(allocCalls,
+                              mul_size(perAlloc, perElement)));
+
+    return total;
+}
+
+/*
+ * Select an appropriate directory size for a hashtable with the given
+ * maximum number of entries.
+ * This is only needed for hashtables in shared memory, whose directories
+ * cannot be expanded dynamically.
+ * NB: assumes that all hash structure parameters have default values!
+ *
+ * XXX this had better agree with the behavior of init_htab()...
+ */
+long
+hash_select_dirsize(long num_entries)
+{
+    /* buckets wanted, and the segments needed to hold them */
+    long        bucketCount = next_pow2_long(num_entries);
+    long        segCount = next_pow2_long((bucketCount - 1) / DEF_SEGSIZE + 1);
+    long        dirSlots = DEF_DIRSIZE;
+
+    /* double the default directory size until every segment has a slot */
+    for (; dirSlots < segCount; dirSlots <<= 1)
+        ;
+
+    return dirSlots;
+}
+
+/*
+ * Compute the required initial memory allocation for a shared-memory
+ * hashtable with the given parameters. We need space for the HASHHDR
+ * and for the (non expansible) directory.
+ */
+Size
+hash_get_shared_size(HASHCTL *info, int flags)
+{
+    Size        dirBytes;
+
+    /* The caller must supply a directory size, and it must be fixed */
+    Assert(flags & HASH_DIRSIZE);
+    Assert(info->dsize == info->max_dsize);
+
+    /* header first, then the directory of segment pointers */
+    dirBytes = info->dsize * sizeof(HASHSEGMENT);
+
+    return sizeof(HASHHDR) + dirBytes;
+}
+
+
+/********************** DESTROY ROUTINES ************************/
+
+void
+hash_destroy(HTAB *hashp)
+{
+    /* Destroying a NULL table is a no-op */
+    if (hashp == NULL)
+        return;
+
+    /*
+     * We only know how to free tables built with our own allocator, which
+     * implies the table owns a private memory context.
+     */
+    Assert(hashp->alloc == DynaHashAlloc);
+    Assert(hashp->hcxt != NULL);
+
+    hash_stats("destroy", hashp);
+
+    /* Deleting the table's memory context releases everything at once */
+    MemoryContextDelete(hashp->hcxt);
+}
+
+void
+hash_stats(const char *where, HTAB *hashp)
+{
+    /* Prints nothing unless built with HASH_STATISTICS */
+#ifdef HASH_STATISTICS
+    HASHHDR    *hdr = hashp->hctl;
+
+    /* per-table counters */
+    fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
+            where, hdr->accesses, hdr->collisions);
+
+    fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
+            hash_get_num_entries(hashp), (long) hdr->keysize,
+            hdr->max_bucket, hdr->nsegs);
+
+    /* file-level totals across all tables */
+    fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
+            where, hash_accesses, hash_collisions);
+    fprintf(stderr, "hash_stats: total expansions %ld\n",
+            hash_expansions);
+#endif
+}
+
+/*******************************SEARCH ROUTINES *****************************/
+
+
+/*
+ * get_hash_value -- exported routine to calculate a key's hash value
+ *
+ * We export this because for partitioned tables, callers need to compute
+ * the partition number (from the low-order bits of the hash value) before
+ * searching.
+ */
+uint32
+get_hash_value(HTAB *hashp, const void *keyPtr)
+{
+    /* Apply the table's own hash function over its configured key size */
+    uint32      hashcode = hashp->hash(keyPtr, hashp->keysize);
+
+    return hashcode;
+}
+
+/* Convert a hash value to a bucket number */
+static inline uint32
+calc_bucket(HASHHDR *hctl, uint32 hash_val)
+{
+    /* First try the mask that covers the entire table */
+    uint32      masked = hash_val & hctl->high_mask;
+
+    /* If that lands past the last bucket in use, use the lower-half mask */
+    return (masked > hctl->max_bucket) ? (masked & hctl->low_mask) : masked;
+}
+
+/*
+ * hash_search -- look up key in table and perform action
+ * hash_search_with_hash_value -- same, with key's hash value already computed
+ *
+ * action is one of:
+ * HASH_FIND: look up key in table
+ * HASH_ENTER: look up key in table, creating entry if not present
+ * HASH_ENTER_NULL: same, but return NULL if out of memory
+ * HASH_REMOVE: look up key in table, remove entry if present
+ *
+ * Return value is a pointer to the element found/entered/removed if any,
+ * or NULL if no match was found. (NB: in the case of the REMOVE action,
+ * the result is a dangling pointer that shouldn't be dereferenced!)
+ *
+ * HASH_ENTER will normally ereport a generic "out of memory" error if
+ * it is unable to create a new entry. The HASH_ENTER_NULL operation is
+ * the same except it will return NULL if out of memory.
+ *
+ * If foundPtr isn't NULL, then *foundPtr is set true if we found an
+ * existing entry in the table, false otherwise. This is needed in the
+ * HASH_ENTER case, but is redundant with the return value otherwise.
+ *
+ * For hash_search_with_hash_value, the hashvalue parameter must have been
+ * calculated with get_hash_value().
+ */
+void *
+hash_search(HTAB *hashp,
+            const void *keyPtr,
+            HASHACTION action,
+            bool *foundPtr)
+{
+    /* hash the key with the table's own hash function ... */
+    uint32      hashvalue = hashp->hash(keyPtr, hashp->keysize);
+
+    /* ... and let the with-hash-value variant do all the real work */
+    return hash_search_with_hash_value(hashp, keyPtr, hashvalue,
+                                       action, foundPtr);
+}
+
+/*
+ * hash_search_with_hash_value -- hash_search for callers that already have
+ * the key's hash value
+ *
+ * Walks the bucket chain selected by hashvalue looking for an element whose
+ * stored hash value equals hashvalue and whose key compares equal to keyPtr
+ * under hashp->match, then carries out the requested action.  If foundPtr is
+ * not NULL, *foundPtr is set to say whether such an element already existed.
+ *
+ * Returns ELEMENTKEY() of the element found, entered or removed.  Returns
+ * NULL if HASH_FIND or HASH_REMOVE found no match, or if HASH_ENTER_NULL
+ * could not obtain a new element.  Raises an error for HASH_ENTER when no
+ * element can be obtained, for an insertion into a frozen table, and for an
+ * unrecognized action code.
+ */
+void *
+hash_search_with_hash_value(HTAB *hashp,
+ const void *keyPtr,
+ uint32 hashvalue,
+ HASHACTION action,
+ bool *foundPtr)
+{
+ HASHHDR *hctl = hashp->hctl;
+ int freelist_idx = FREELIST_IDX(hctl, hashvalue);
+ Size keysize;
+ uint32 bucket;
+ long segment_num;
+ long segment_ndx;
+ HASHSEGMENT segp;
+ HASHBUCKET currBucket;
+ HASHBUCKET *prevBucketPtr;
+ HashCompareFunc match;
+
+#ifdef HASH_STATISTICS
+ hash_accesses++;
+ hctl->accesses++;
+#endif
+
+ /*
+ * If inserting, check if it is time to split a bucket.
+ *
+ * NOTE: failure to expand table is not a fatal error, it just means we
+ * have to run at higher fill factor than we wanted. However, if we're
+ * using the palloc allocator then it will throw error anyway on
+ * out-of-memory, so we must do this before modifying the table.
+ */
+ if (action == HASH_ENTER || action == HASH_ENTER_NULL)
+ {
+ /*
+ * Can't split if running in partitioned mode, nor if frozen, nor if
+ * table is the subject of any active hash_seq_search scans.
+ */
+ if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
+ !IS_PARTITIONED(hctl) && !hashp->frozen &&
+ !has_seq_scans(hashp))
+ (void) expand_table(hashp);
+ }
+
+ /*
+ * Do the initial lookup
+ */
+ bucket = calc_bucket(hctl, hashvalue);
+
+ /* split the bucket number into a directory slot and a segment offset */
+ segment_num = bucket >> hashp->sshift;
+ segment_ndx = MOD(bucket, hashp->ssize);
+
+ segp = hashp->dir[segment_num];
+
+ if (segp == NULL)
+ hash_corrupted(hashp);
+
+ /*
+ * prevBucketPtr always addresses the link that points at currBucket, so
+ * that an element can later be unlinked, or a new one appended, through it.
+ */
+ prevBucketPtr = &segp[segment_ndx];
+ currBucket = *prevBucketPtr;
+
+ /*
+ * Follow collision chain looking for matching key
+ */
+ match = hashp->match; /* save one fetch in inner loop */
+ keysize = hashp->keysize; /* ditto */
+
+ while (currBucket != NULL)
+ {
+ /* compare the stored hash values first; match() runs only if they agree */
+ if (currBucket->hashvalue == hashvalue &&
+ match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
+ break;
+ prevBucketPtr = &(currBucket->link);
+ currBucket = *prevBucketPtr;
+#ifdef HASH_STATISTICS
+ hash_collisions++;
+ hctl->collisions++;
+#endif
+ }
+
+ /* currBucket is now the matching element, or NULL if the chain has none */
+ if (foundPtr)
+ *foundPtr = (bool) (currBucket != NULL);
+
+ /*
+ * OK, now what?
+ */
+ switch (action)
+ {
+ case HASH_FIND:
+ if (currBucket != NULL)
+ return (void *) ELEMENTKEY(currBucket);
+ return NULL;
+
+ case HASH_REMOVE:
+ if (currBucket != NULL)
+ {
+ /* if partitioned, must lock to touch nentries and freeList */
+ if (IS_PARTITIONED(hctl))
+ SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
+
+ /* delete the record from the appropriate nentries counter. */
+ Assert(hctl->freeList[freelist_idx].nentries > 0);
+ hctl->freeList[freelist_idx].nentries--;
+
+ /* remove record from hash bucket's chain. */
+ *prevBucketPtr = currBucket->link;
+
+ /* add the record to the appropriate freelist. */
+ currBucket->link = hctl->freeList[freelist_idx].freeList;
+ hctl->freeList[freelist_idx].freeList = currBucket;
+
+ if (IS_PARTITIONED(hctl))
+ SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+ /*
+ * better hope the caller is synchronizing access to this
+ * element, because someone else is going to reuse it the next
+ * time something is added to the table
+ */
+ return (void *) ELEMENTKEY(currBucket);
+ }
+ return NULL;
+
+ case HASH_ENTER:
+ case HASH_ENTER_NULL:
+ /* Return existing element if found, else create one */
+ if (currBucket != NULL)
+ return (void *) ELEMENTKEY(currBucket);
+
+ /* disallow inserts if frozen */
+ if (hashp->frozen)
+ elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
+ hashp->tabname);
+
+ currBucket = get_hash_entry(hashp, freelist_idx);
+ if (currBucket == NULL)
+ {
+ /* out of memory */
+ if (action == HASH_ENTER_NULL)
+ return NULL;
+ /* report a generic message */
+ if (hashp->isshared)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ /*
+ * link into hashbucket chain: the search loop above ran off the end
+ * of the chain, so prevBucketPtr addresses its terminating link and
+ * the new element becomes the last one
+ */
+ *prevBucketPtr = currBucket;
+ currBucket->link = NULL;
+
+ /* copy key into record */
+ currBucket->hashvalue = hashvalue;
+ hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
+
+ /*
+ * Caller is expected to fill the data field on return. DO NOT
+ * insert any code that could possibly throw error here, as doing
+ * so would leave the table entry incomplete and hence corrupt the
+ * caller's data structure.
+ */
+
+ return (void *) ELEMENTKEY(currBucket);
+ }
+
+ /* reached only when action matched none of the cases above */
+ elog(ERROR, "unrecognized hash action code: %d", (int) action);
+
+ return NULL; /* keep compiler quiet */
+}
+
+/*
+ * hash_update_hash_key -- change the hash key of an existing table entry
+ *
+ * This is equivalent to removing the entry, making a new entry, and copying
+ * over its data, except that the entry never goes to the table's freelist.
+ * Therefore this cannot suffer an out-of-memory failure, even if there are
+ * other processes operating in other partitions of the hashtable.
+ *
+ * Returns true if successful, false if the requested new hash key is already
+ * present. Throws error if the specified entry pointer isn't actually a
+ * table member.
+ *
+ * NB: currently, there is no special case for old and new hash keys being
+ * identical, which means we'll report false for that situation. This is
+ * preferable for existing uses.
+ *
+ * NB: for a partitioned hashtable, caller must hold lock on both relevant
+ * partitions, if the new hash key would belong to a different partition.
+ */
+bool
+hash_update_hash_key(HTAB *hashp,
+ void *existingEntry,
+ const void *newKeyPtr)
+{
+ /* map the caller's entry pointer back to the element that contains it */
+ HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
+ HASHHDR *hctl = hashp->hctl;
+ uint32 newhashvalue;
+ Size keysize;
+ uint32 bucket;
+ uint32 newbucket;
+ long segment_num;
+ long segment_ndx;
+ HASHSEGMENT segp;
+ HASHBUCKET currBucket;
+ HASHBUCKET *prevBucketPtr;
+ HASHBUCKET *oldPrevPtr;
+ HashCompareFunc match;
+
+#ifdef HASH_STATISTICS
+ hash_accesses++;
+ hctl->accesses++;
+#endif
+
+ /* disallow updates if frozen */
+ if (hashp->frozen)
+ elog(ERROR, "cannot update in frozen hashtable \"%s\"",
+ hashp->tabname);
+
+ /*
+ * Lookup the existing element using its saved hash value. We need to do
+ * this to be able to unlink it from its hash chain, but as a side benefit
+ * we can verify the validity of the passed existingEntry pointer.
+ */
+ bucket = calc_bucket(hctl, existingElement->hashvalue);
+
+ segment_num = bucket >> hashp->sshift;
+ segment_ndx = MOD(bucket, hashp->ssize);
+
+ segp = hashp->dir[segment_num];
+
+ if (segp == NULL)
+ hash_corrupted(hashp);
+
+ prevBucketPtr = &segp[segment_ndx];
+ currBucket = *prevBucketPtr;
+
+ /* this walk compares element pointers, not keys */
+ while (currBucket != NULL)
+ {
+ if (currBucket == existingElement)
+ break;
+ prevBucketPtr = &(currBucket->link);
+ currBucket = *prevBucketPtr;
+ }
+
+ if (currBucket == NULL)
+ elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
+ hashp->tabname);
+
+ /*
+ * Remember the link that points at the existing element; prevBucketPtr
+ * is reused for the second chain walk below.
+ */
+ oldPrevPtr = prevBucketPtr;
+
+ /*
+ * Now perform the equivalent of a HASH_ENTER operation to locate the hash
+ * chain we want to put the entry into.
+ */
+ newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
+
+ newbucket = calc_bucket(hctl, newhashvalue);
+
+ segment_num = newbucket >> hashp->sshift;
+ segment_ndx = MOD(newbucket, hashp->ssize);
+
+ segp = hashp->dir[segment_num];
+
+ if (segp == NULL)
+ hash_corrupted(hashp);
+
+ prevBucketPtr = &segp[segment_ndx];
+ currBucket = *prevBucketPtr;
+
+ /*
+ * Follow collision chain looking for matching key
+ */
+ match = hashp->match; /* save one fetch in inner loop */
+ keysize = hashp->keysize; /* ditto */
+
+ while (currBucket != NULL)
+ {
+ if (currBucket->hashvalue == newhashvalue &&
+ match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
+ break;
+ prevBucketPtr = &(currBucket->link);
+ currBucket = *prevBucketPtr;
+#ifdef HASH_STATISTICS
+ hash_collisions++;
+ hctl->collisions++;
+#endif
+ }
+
+ /*
+ * A match here means the new key is already present in the table; nothing
+ * has been modified yet, so we can simply report failure.
+ */
+ if (currBucket != NULL)
+ return false; /* collision with an existing entry */
+
+ /* from here on currBucket is the element being re-keyed */
+ currBucket = existingElement;
+
+ /*
+ * If old and new hash values belong to the same bucket, we need not
+ * change any chain links, and indeed should not since this simplistic
+ * update will corrupt the list if currBucket is the last element. (We
+ * cannot fall out earlier, however, since we need to scan the bucket to
+ * check for duplicate keys.)
+ */
+ if (bucket != newbucket)
+ {
+ /* OK to remove record from old hash bucket's chain. */
+ *oldPrevPtr = currBucket->link;
+
+ /*
+ * link into new hashbucket chain; prevBucketPtr addresses the
+ * terminating link of that chain because the walk above reached its end
+ */
+ *prevBucketPtr = currBucket;
+ currBucket->link = NULL;
+ }
+
+ /* copy new key into record */
+ currBucket->hashvalue = newhashvalue;
+ hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
+
+ /* rest of record is untouched */
+
+ return true;
+}
+
+/*
+ * Allocate a new hashtable entry if possible; return NULL if out of memory.
+ * (Or, if the underlying space allocator throws error for out-of-memory,
+ * we won't return at all.)
+ *
+ * freelist_idx selects the freelist to take the element from and the
+ * nentries counter to charge it to.  On success the returned element has
+ * been unlinked from a freelist (possibly a different one, in a partitioned
+ * table) and freeList[freelist_idx].nentries has been incremented.
+ */
+static HASHBUCKET
+get_hash_entry(HTAB *hashp, int freelist_idx)
+{
+ HASHHDR *hctl = hashp->hctl;
+ HASHBUCKET newElement;
+
+ /*
+ * Each iteration either finds an element on our own freelist (leaving the
+ * loop with the mutex still held, if partitioned), refills that freelist
+ * and retries, or returns from within the loop.
+ */
+ for (;;)
+ {
+ /* if partitioned, must lock to touch nentries and freeList */
+ if (IS_PARTITIONED(hctl))
+ SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
+
+ /* try to get an entry from the freelist */
+ newElement = hctl->freeList[freelist_idx].freeList;
+
+ if (newElement != NULL)
+ break;
+
+ /* drop the mutex before calling element_alloc, which takes it itself */
+ if (IS_PARTITIONED(hctl))
+ SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+ /*
+ * No free elements in this freelist. In a partitioned table, there
+ * might be entries in other freelists, but to reduce contention we
+ * prefer to first try to get another chunk of buckets from the main
+ * shmem allocator. If that fails, though, we *MUST* root through all
+ * the other freelists before giving up. There are multiple callers
+ * that assume that they can allocate every element in the initially
+ * requested table size, or that deleting an element guarantees they
+ * can insert a new element, even if shared memory is entirely full.
+ * Failing because the needed element is in a different freelist is
+ * not acceptable.
+ */
+ if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
+ {
+ int borrow_from_idx;
+
+ if (!IS_PARTITIONED(hctl))
+ return NULL; /* out of memory */
+
+ /* try to borrow element from another freelist */
+ borrow_from_idx = freelist_idx;
+ for (;;)
+ {
+ /* visit the other freelists in circular order, one at a time */
+ borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
+ if (borrow_from_idx == freelist_idx)
+ break; /* examined all freelists, fail */
+
+ SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
+ newElement = hctl->freeList[borrow_from_idx].freeList;
+
+ if (newElement != NULL)
+ {
+ hctl->freeList[borrow_from_idx].freeList = newElement->link;
+ SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+
+ /* careful: count the new element in its proper freelist */
+ SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
+ hctl->freeList[freelist_idx].nentries++;
+ SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+ return newElement;
+ }
+
+ SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
+ }
+
+ /* no elements available to borrow either, so out of memory */
+ return NULL;
+ }
+ }
+
+ /*
+ * remove entry from freelist, bump nentries (for a partitioned table the
+ * mutex acquired at the top of the final loop iteration is still held)
+ */
+ hctl->freeList[freelist_idx].freeList = newElement->link;
+ hctl->freeList[freelist_idx].nentries++;
+
+ if (IS_PARTITIONED(hctl))
+ SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+ return newElement;
+}
+
+/*
+ * hash_get_num_entries -- report how many entries the table currently holds
+ *
+ * The result is the nentries counter of freelist 0 plus, for a partitioned
+ * table, the counters of all the remaining freelists.
+ */
+long
+hash_get_num_entries(HTAB *hashp)
+{
+    HASHHDR    *hctl = hashp->hctl;
+    long        total = hctl->freeList[0].nentries;
+    int         idx;
+
+    /* only freelist 0 is consulted for an unpartitioned table */
+    if (!IS_PARTITIONED(hctl))
+        return total;
+
+    /*
+     * The mutexes are deliberately not taken here: calling this function is
+     * only sensible while holding lock on every partition of the table.
+     */
+    for (idx = 1; idx < NUM_FREELISTS; idx++)
+        total += hctl->freeList[idx].nentries;
+
+    return total;
+}
+
+/*
+ * hash_seq_init/_search/_term
+ * Sequentially search through hash table and return
+ * all the elements one by one, return NULL when no more.
+ *
+ * hash_seq_term should be called if and only if the scan is abandoned before
+ * completion; if hash_seq_search returns NULL then it has already done the
+ * end-of-scan cleanup.
+ *
+ * NOTE: caller may delete the returned element before continuing the scan.
+ * However, deleting any other element while the scan is in progress is
+ * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
+ * if elements are added to the table while the scan is in progress, it is
+ * unspecified whether they will be visited by the scan or not.
+ *
+ * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
+ * worry about hash_seq_term cleanup, if the hashtable is first locked against
+ * further insertions by calling hash_freeze.
+ *
+ * NOTE: to use this with a partitioned hashtable, caller had better hold
+ * at least shared lock on all partitions of the table throughout the scan!
+ * We can cope with insertions or deletions by our own backend, but *not*
+ * with concurrent insertions or deletions by another.
+ */
+void
+hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
+{
+    /* begin at bucket 0 with no chain element pending */
+    status->hashp = hashp;
+    status->curBucket = 0;
+    status->curEntry = NULL;
+
+    /* scans of a frozen table are not registered */
+    if (hashp->frozen)
+        return;
+
+    register_seq_scan(hashp);
+}
+
+/*
+ * hash_seq_search -- return the next element of a sequential scan
+ *
+ * status->curEntry is the element to hand out next from the chain currently
+ * being scanned (NULL if a new chain has to be located), and
+ * status->curBucket is the bucket at which the search for the next nonempty
+ * chain starts.  Returns ELEMENTKEY() of the next element, or NULL, after
+ * calling hash_seq_term, once curBucket has moved past max_bucket.
+ */
+void *
+hash_seq_search(HASH_SEQ_STATUS *status)
+{
+ HTAB *hashp;
+ HASHHDR *hctl;
+ uint32 max_bucket;
+ long ssize;
+ long segment_num;
+ long segment_ndx;
+ HASHSEGMENT segp;
+ uint32 curBucket;
+ HASHELEMENT *curElem;
+
+ if ((curElem = status->curEntry) != NULL)
+ {
+ /* Continuing scan of curBucket... */
+ /* fetch the successor now, before curElem is handed to the caller */
+ status->curEntry = curElem->link;
+ if (status->curEntry == NULL) /* end of this bucket */
+ ++status->curBucket;
+ return (void *) ELEMENTKEY(curElem);
+ }
+
+ /*
+ * Search for next nonempty bucket starting at curBucket.
+ */
+ curBucket = status->curBucket;
+ hashp = status->hashp;
+ hctl = hashp->hctl;
+ ssize = hashp->ssize;
+ max_bucket = hctl->max_bucket;
+
+ if (curBucket > max_bucket)
+ {
+ hash_seq_term(status);
+ return NULL; /* search is done */
+ }
+
+ /*
+ * first find the right segment in the table directory.
+ */
+ segment_num = curBucket >> hashp->sshift;
+ segment_ndx = MOD(curBucket, ssize);
+
+ segp = hashp->dir[segment_num];
+
+ /*
+ * Pick up the first item in this bucket's chain. If chain is not empty
+ * we can begin searching it. Otherwise we have to advance to find the
+ * next nonempty bucket. We try to optimize that case since searching a
+ * near-empty hashtable has to iterate this loop a lot.
+ */
+ while ((curElem = segp[segment_ndx]) == NULL)
+ {
+ /* empty bucket, advance to next */
+ if (++curBucket > max_bucket)
+ {
+ /* store the position so that a further call also reports end of scan */
+ status->curBucket = curBucket;
+ hash_seq_term(status);
+ return NULL; /* search is done */
+ }
+ /* step to the next directory entry when the current segment is used up */
+ if (++segment_ndx >= ssize)
+ {
+ segment_num++;
+ segment_ndx = 0;
+ segp = hashp->dir[segment_num];
+ }
+ }
+
+ /* Begin scan of curBucket... */
+ status->curEntry = curElem->link;
+ if (status->curEntry == NULL) /* end of this bucket */
+ ++curBucket;
+ status->curBucket = curBucket;
+ return (void *) ELEMENTKEY(curElem);
+}
+
+void
+hash_seq_term(HASH_SEQ_STATUS *status)
+{
+    HTAB       *hashp = status->hashp;
+
+    /* nothing to undo for a frozen table: its scans are not tracked */
+    if (hashp->frozen)
+        return;
+
+    deregister_seq_scan(hashp);
+}
+
+/*
+ * hash_freeze
+ *		Mark a hashtable as frozen, i.e. closed to further insertions
+ *		(deletions remain possible).
+ *
+ * Once no more bucket splits can happen there is no need to register
+ * hash_seq_search scans any longer, so callers no longer have to make sure
+ * that hash_seq_term is called at the right times.
+ *
+ * Freezing an already-frozen table is harmless.  Shared tables cannot be
+ * frozen, and neither can a table that still has active scans (hash_seq_term
+ * would then do the wrong thing); both cases raise an error.
+ */
+void
+hash_freeze(HTAB *hashp)
+{
+    if (hashp->isshared)
+        elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
+
+    /* repeated calls are allowed and change nothing */
+    if (hashp->frozen)
+        return;
+
+    if (has_seq_scans(hashp))
+        elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
+             hashp->tabname);
+
+    hashp->frozen = true;
+}
+
+
+/********************************* UTILITIES ************************/
+
+/*
+ * Expand the table by adding one more hash bucket.
+ *
+ * The new bucket gets number max_bucket + 1.  The elements of the one
+ * existing bucket that shares its low-order hash bits are then redistributed
+ * between that bucket and the new one.
+ *
+ * Returns true if the bucket was added.  Returns false, leaving max_bucket
+ * unchanged, if the directory could not be enlarged or no new segment could
+ * be allocated.  Must not be called for a partitioned table.
+ */
+static bool
+expand_table(HTAB *hashp)
+{
+ HASHHDR *hctl = hashp->hctl;
+ HASHSEGMENT old_seg,
+ new_seg;
+ long old_bucket,
+ new_bucket;
+ long new_segnum,
+ new_segndx;
+ long old_segnum,
+ old_segndx;
+ HASHBUCKET *oldlink,
+ *newlink;
+ HASHBUCKET currElement,
+ nextElement;
+
+ Assert(!IS_PARTITIONED(hctl));
+
+#ifdef HASH_STATISTICS
+ hash_expansions++;
+#endif
+
+ /* directory slot and segment offset of the bucket about to be created */
+ new_bucket = hctl->max_bucket + 1;
+ new_segnum = new_bucket >> hashp->sshift;
+ new_segndx = MOD(new_bucket, hashp->ssize);
+
+ if (new_segnum >= hctl->nsegs)
+ {
+ /* Allocate new segment if necessary -- could fail if dir full */
+ if (new_segnum >= hctl->dsize)
+ if (!dir_realloc(hashp))
+ return false;
+ if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
+ return false;
+ hctl->nsegs++;
+ }
+
+ /* OK, we created a new bucket */
+ hctl->max_bucket++;
+
+ /*
+ * *Before* changing masks, find old bucket corresponding to same hash
+ * values; values in that bucket may need to be relocated to new bucket.
+ * Note that new_bucket is certainly larger than low_mask at this point,
+ * so we can skip the first step of the regular hash mask calc.
+ */
+ old_bucket = (new_bucket & hctl->low_mask);
+
+ /*
+ * If we crossed a power of 2, readjust masks.
+ */
+ if ((uint32) new_bucket > hctl->high_mask)
+ {
+ hctl->low_mask = hctl->high_mask;
+ hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
+ }
+
+ /*
+ * Relocate records to the new bucket. NOTE: because of the way the hash
+ * masking is done in calc_bucket, only one old bucket can need to be
+ * split at this point. With a different way of reducing the hash value,
+ * that might not be true!
+ */
+ old_segnum = old_bucket >> hashp->sshift;
+ old_segndx = MOD(old_bucket, hashp->ssize);
+
+ old_seg = hashp->dir[old_segnum];
+ new_seg = hashp->dir[new_segnum];
+
+ /*
+ * oldlink and newlink always address the link field that the next element
+ * kept in the old chain, or moved to the new chain, has to be stored in.
+ */
+ oldlink = &old_seg[old_segndx];
+ newlink = &new_seg[new_segndx];
+
+ for (currElement = *oldlink;
+ currElement != NULL;
+ currElement = nextElement)
+ {
+ /* save the successor first: the element's link may be overwritten later */
+ nextElement = currElement->link;
+ /* calc_bucket sees the updated max_bucket and masks at this point */
+ if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
+ {
+ *oldlink = currElement;
+ oldlink = &currElement->link;
+ }
+ else
+ {
+ *newlink = currElement;
+ newlink = &currElement->link;
+ }
+ }
+ /* don't forget to terminate the rebuilt hash chains... */
+ *oldlink = NULL;
+ *newlink = NULL;
+
+ return true;
+}
+
+
+/*
+ * Replace the segment directory with one of twice the size.
+ *
+ * Returns true on success.  Returns false, leaving the directory as it was,
+ * when max_dsize is set to something other than NO_MAX_DSIZE or when the
+ * table's allocator returns NULL.
+ */
+static bool
+dir_realloc(HTAB *hashp)
+{
+    HASHHDR    *hctl = hashp->hctl;
+    HASHSEGMENT *old_dir;
+    HASHSEGMENT *new_dir;
+    long        doubled_dsize;
+    long        old_bytes;
+    long        new_bytes;
+
+    if (hctl->max_dsize != NO_MAX_DSIZE)
+        return false;
+
+    /* work out the old and new directory sizes, in entries and in bytes */
+    doubled_dsize = hctl->dsize << 1;
+    old_bytes = hctl->dsize * sizeof(HASHSEGMENT);
+    new_bytes = doubled_dsize * sizeof(HASHSEGMENT);
+
+    old_dir = hashp->dir;
+    CurrentDynaHashCxt = hashp->hcxt;
+    new_dir = (HASHSEGMENT *) hashp->alloc((Size) new_bytes);
+
+    if (new_dir == NULL)
+        return false;
+
+    /* carry over the existing entries and zero the newly added half */
+    memcpy(new_dir, old_dir, old_bytes);
+    MemSet(((char *) new_dir) + old_bytes, 0, new_bytes - old_bytes);
+    hashp->dir = new_dir;
+    hctl->dsize = doubled_dsize;
+
+    /* XXX assume the allocator is palloc, so we know how to free */
+    Assert(hashp->alloc == DynaHashAlloc);
+    pfree(old_dir);
+
+    return true;
+}
+
+
+/*
+ * Allocate one segment, i.e. an array of hashp->ssize bucket headers, with
+ * the table's allocator and zero it.  Returns NULL if the allocator does.
+ */
+static HASHSEGMENT
+seg_alloc(HTAB *hashp)
+{
+    HASHSEGMENT seg;
+
+    CurrentDynaHashCxt = hashp->hcxt;
+    seg = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
+
+    /* every bucket of a fresh segment starts out as an empty chain */
+    if (seg != NULL)
+        MemSet(seg, 0, sizeof(HASHBUCKET) * hashp->ssize);
+
+    return seg;
+}
+
+/*
+ * Allocate nelem new elements in a single chunk and push them onto the
+ * freelist selected by freelist_idx.
+ *
+ * Returns false, without allocating anything, if the table is marked
+ * isfixed or if the allocator returns NULL; returns true otherwise.
+ */
+static bool
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
+{
+    HASHHDR    *hctl = hashp->hctl;
+    Size        elementSize;
+    char       *chunk;
+    HASHELEMENT *chainTail;
+    HASHELEMENT *chainHead = NULL;
+    int         n;
+
+    if (hashp->isfixed)
+        return false;
+
+    /* Each element has a HASHELEMENT header plus user data. */
+    elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
+
+    CurrentDynaHashCxt = hashp->hcxt;
+    chunk = (char *) hashp->alloc(nelem * elementSize);
+
+    if (chunk == NULL)
+        return false;
+
+    /*
+     * Chain the new elements together, each one pointing at its predecessor
+     * in the chunk.  The first element therefore ends up at the tail of the
+     * chain and the last one at its head.
+     */
+    chainTail = (HASHELEMENT *) chunk;
+    for (n = 0; n < nelem; n++)
+    {
+        HASHELEMENT *elem = (HASHELEMENT *) (chunk + n * elementSize);
+
+        elem->link = chainHead;
+        chainHead = elem;
+    }
+
+    /* if partitioned, must lock to touch freeList */
+    if (IS_PARTITIONED(hctl))
+        SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
+
+    /*
+     * Splice the chain in front of whatever is on the freelist already; it
+     * could be nonempty if two backends did this concurrently.
+     */
+    chainTail->link = hctl->freeList[freelist_idx].freeList;
+    hctl->freeList[freelist_idx].freeList = chainHead;
+
+    if (IS_PARTITIONED(hctl))
+        SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
+
+    return true;
+}
+
+/*
+ * Report that a hashtable has been found to be corrupted.
+ *
+ * Corruption of a backend-local table is reported at FATAL level, which
+ * shuts down just this one backend.  Corruption of a shared table is
+ * reported at PANIC level, so as to force a systemwide restart.
+ */
+static void
+hash_corrupted(HTAB *hashp)
+{
+    if (!hashp->isshared)
+        elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
+    else
+        elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
+}
+
+/*
+ * Calculate ceil(log base 2) of num.
+ *
+ * Inputs above LONG_MAX / 2 are treated as LONG_MAX / 2, because larger
+ * values would be invalid for pg_ceil_log2_*().
+ */
+int
+my_log2(long num)
+{
+    const long  limit = LONG_MAX / 2;
+    long        clamped = (num > limit) ? limit : num;
+
+    /* use the variant that matches the width of long */
+#if SIZEOF_LONG < 8
+    return pg_ceil_log2_32(clamped);
+#else
+    return pg_ceil_log2_64(clamped);
+#endif
+}
+
+/*
+ * Calculate the first power of 2 >= num, bounded to what will fit in a long.
+ */
+static long
+next_pow2_long(long num)
+{
+    /* no clamping needed here: my_log2 range-checks its input itself */
+    int         shift = my_log2(num);
+
+    return 1L << shift;
+}
+
+/*
+ * Calculate the first power of 2 >= num, bounded to what will fit in an int.
+ */
+static int
+next_pow2_int(long num)
+{
+    /* cap the input so that the shift below stays within int range */
+    long        bounded = (num > INT_MAX / 2) ? INT_MAX / 2 : num;
+
+    return 1 << my_log2(bounded);
+}
+
+
+/************************* SEQ SCAN TRACKING ************************/
+
+/*
+ * We track active hash_seq_search scans here. The need for this mechanism
+ * comes from the fact that a scan will get confused if a bucket split occurs
+ * while it's in progress: it might visit entries twice, or even miss some
+ * entirely (if it's partway through the same bucket that splits). Hence
+ * we want to inhibit bucket splits if there are any active scans on the
+ * table being inserted into. This is a fairly rare case in current usage,
+ * so just postponing the split until the next insertion seems sufficient.
+ *
+ * Given present usages of the function, only a few scans are likely to be
+ * open concurrently; so a finite-size stack of open scans seems sufficient,
+ * and we don't worry that linear search is too slow. Note that we do
+ * allow multiple scans of the same hashtable to be open concurrently.
+ *
+ * This mechanism can support concurrent scan and insertion in a shared
+ * hashtable if it's the same backend doing both. It would fail otherwise,
+ * but locking reasons seem to preclude any such scenario anyway, so we don't
+ * worry.
+ *
+ * This arrangement is reasonably robust if a transient hashtable is deleted
+ * without notifying us. The absolute worst case is we might inhibit splits
+ * in another table created later at exactly the same address. We will give
+ * a warning at transaction end for reference leaks, so any bugs leading to
+ * lack of notification should be easy to catch.
+ */
+
+/* capacity of the tracking arrays; register_seq_scan raises an error beyond it */
+#define MAX_SEQ_SCANS 100
+
+static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
+static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */
+static int num_seq_scans = 0; /* number of array slots currently in use */
+
+
+/*
+ * Record that a hash_seq_search scan has been started on hashp.
+ *
+ * The table and the current transaction nesting level are stored in the
+ * next unused slot of the tracking arrays; an error is raised if all
+ * MAX_SEQ_SCANS slots are taken.
+ */
+static void
+register_seq_scan(HTAB *hashp)
+{
+    int         slot = num_seq_scans;
+
+    if (slot >= MAX_SEQ_SCANS)
+        elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
+             hashp->tabname);
+
+    seq_scan_tables[slot] = hashp;
+    seq_scan_level[slot] = GetCurrentTransactionNestLevel();
+    num_seq_scans = slot + 1;
+}
+
+/*
+ * Forget one registered scan of hashp.
+ *
+ * The most recently registered matching entry is removed by overwriting it
+ * with the last entry of the arrays.  An error is raised if hashp has no
+ * registered scan.
+ */
+static void
+deregister_seq_scan(HTAB *hashp)
+{
+    int         pos = num_seq_scans;
+
+    /* Scan from the top down, since that is where the entry most likely is */
+    while (--pos >= 0)
+    {
+        int         last;
+
+        if (seq_scan_tables[pos] != hashp)
+            continue;
+
+        /* fill the vacated slot with the topmost entry, then shrink */
+        last = num_seq_scans - 1;
+        seq_scan_tables[pos] = seq_scan_tables[last];
+        seq_scan_level[pos] = seq_scan_level[last];
+        num_seq_scans = last;
+        return;
+    }
+
+    elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
+         hashp->tabname);
+}
+
+/*
+ * Report whether at least one registered scan refers to hashp.
+ */
+static bool
+has_seq_scans(HTAB *hashp)
+{
+    int         pos = 0;
+
+    while (pos < num_seq_scans)
+    {
+        if (seq_scan_tables[pos] == hashp)
+            return true;
+        pos++;
+    }
+
+    return false;
+}
+
+/*
+ * End-of-transaction cleanup: discard every registered scan.
+ *
+ * Scans that are still open at abort are expected and are dropped silently.
+ * A scan that is still open at commit means that somebody forgot to call
+ * hash_seq_term(), so each one draws a warning.
+ */
+void
+AtEOXact_HashTables(bool isCommit)
+{
+    int         i;
+
+    /*
+     * Only the table's address is printed.  Printing the tabname is tempting,
+     * but it might touch deallocated memory, and since this is not a
+     * user-facing message it need not be pretty.
+     */
+    for (i = 0; isCommit && i < num_seq_scans; i++)
+        elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+             seq_scan_tables[i]);
+
+    num_seq_scans = 0;
+}
+
+/*
+ * End-of-subtransaction cleanup: discard every scan that was registered at
+ * nesting level nestDepth or deeper, warning about each one if the
+ * subtransaction is committing.
+ */
+void
+AtEOSubXact_HashTables(bool isCommit, int nestDepth)
+{
+    int         pos = num_seq_scans;
+
+    /*
+     * Walking backward keeps removal simple.  Every entry has to be looked
+     * at, not just those at the end of the arrays, because the removal
+     * technique does not keep the entries in order.
+     */
+    while (--pos >= 0)
+    {
+        int         last;
+
+        /* entries registered at an outer nesting level are left alone */
+        if (seq_scan_level[pos] < nestDepth)
+            continue;
+
+        if (isCommit)
+            elog(WARNING, "leaked hash_seq_search scan for hash table %p",
+                 seq_scan_tables[pos]);
+
+        /* fill the vacated slot with the topmost entry, then shrink */
+        last = num_seq_scans - 1;
+        seq_scan_tables[pos] = seq_scan_tables[last];
+        seq_scan_level[pos] = seq_scan_level[last];
+        num_seq_scans = last;
+    }
+}