/*-------------------------------------------------------------------------
 *
 * dynahash.c
 *	  dynamic chained hash tables
 *
 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
 * shared memory.  For shared hash tables, it is the caller's responsibility
 * to provide appropriate access interlocking.  The simplest convention is
 * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
 * hash_seq_search) need only shared lock, but any update requires exclusive
 * lock.  For heavily-used shared tables, the single-lock approach creates a
 * concurrency bottleneck, so we also support "partitioned" locking wherein
 * there are multiple LWLocks guarding distinct subsets of the table.  To use
 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
 * to hash_create.  This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
 * (A partitioned table uses multiple copies of those fields, guarded by
 * spinlocks, for additional concurrency.)
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition.  We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
 * of the way calc_bucket() maps hash values to bucket numbers.
 *
 * For hash tables in shared memory, the memory allocator function should
 * match malloc's semantics of returning NULL on failure.  For hash tables
 * in local memory, we typically use palloc() which will throw error on
 * failure.  The code in this file has to cope with both cases.
 *
 * dynahash.c provides support for these types of lookup keys:
 *
 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
 * compared as though by strcmp().  This is selected by specifying the
 * HASH_STRINGS flag to hash_create.
 *
 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
 * (Caller must ensure there are no undefined padding bits in the keys!)
 * This is selected by specifying the HASH_BLOBS flag to hash_create.
 *
 * 3. More complex key behavior can be selected by specifying user-supplied
 * hashing, comparison, and/or key-copying functions.  At least a hashing
 * function must be supplied; comparison defaults to memcmp() and key copying
 * to memcpy() when a user-defined hashing function is selected.
 *
 * Compared to simplehash, dynahash has the following benefits:
 *
 * - It supports partitioning, which is useful for shared memory access using
 *   locks.
 * - Shared memory hashes are allocated in a fixed size area at startup and
 *   are discoverable by name from other processes.
 * - Because entries don't need to be moved in the case of hash conflicts,
 *   dynahash has better performance for large entries.
 * - Guarantees stable pointers to entries.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/hash/dynahash.c
 *
 *-------------------------------------------------------------------------
 */

/*
 * Original comments:
 *
 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
 * Coded into C, with minor code improvements, and with hsearch(3) interface,
 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
 * also, hcreate/hdestroy routines added to simulate hsearch(3).
 *
 * These routines simulate hsearch(3) and family, with the important
 * difference that the hash table is dynamic - can grow indefinitely
 * beyond its original size (as supplied to hcreate()).
 *
 * Performance appears to be comparable to that of hsearch(3).
 * The 'source-code' options referred to in hsearch(3)'s 'man' page
 * are not implemented; otherwise functionality is identical.
 *
 * Compilation controls:
 * HASH_DEBUG controls some informative traces, mainly for debugging.
 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
 * when combined with HASH_DEBUG, these are displayed by hdestroy().
 *
 * Problems & fixes to ejp@ausmelb.oz.  WARNING: relies on pre-processor
 * concatenation property, in probably unnecessary code 'optimization'.
 *
 * Modified margo@postgres.berkeley.edu February 1990
 *		added multiple table interface
 * Modified by sullivan@postgres.berkeley.edu April 1990
 *		changed ctl structure for shared memory
 */

#include "postgres.h"

#include <limits.h>

#include "access/xact.h"
#include "common/hashfn.h"
#include "port/pg_bitutils.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"


/*
 * Constants
 *
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers.  The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address.  The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!).  For non-shared hash
 * tables, the initial directory size can be left at the default.
 */
#define DEF_SEGSIZE			   256
#define DEF_SEGSIZE_SHIFT	   8	/* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE			   256

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS 32

/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;

/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking.  Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
	slock_t		mutex;			/* spinlock for this freelist */
	long		nentries;		/* number of entries in associated buckets */
	HASHELEMENT *freeList;		/* chain of free elements */
} FreeListData;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct.
For a non-shared table, there isn't * any functional difference between HASHHDR and HTAB, but we separate them * anyway to share code between shared and non-shared tables. */ struct HASHHDR { /* * The freelist can become a point of contention in high-concurrency hash * tables, so we use an array of freelists, each with its own mutex and * nentries count, instead of just a single one. Although the freelists * normally operate independently, we will scavenge entries from freelists * other than a hashcode's default freelist when necessary. * * If the hash table is not partitioned, only freeList[0] is used and its * spinlock is not used at all; callers' locking is assumed sufficient. */ FreeListData freeList[NUM_FREELISTS]; /* These fields can change, but not in a partitioned table */ /* Also, dsize can't change in a shared table, even if unpartitioned */ long dsize; /* directory size */ long nsegs; /* number of allocated segments (<= dsize) */ uint32 max_bucket; /* ID of maximum bucket in use */ uint32 high_mask; /* mask to modulo into entire table */ uint32 low_mask; /* mask to modulo into lower half of table */ /* These fields are fixed at hashtable creation */ Size keysize; /* hash key length in bytes */ Size entrysize; /* total user element size in bytes */ long num_partitions; /* # partitions (must be power of 2), or 0 */ long max_dsize; /* 'dsize' limit if directory is fixed size */ long ssize; /* segment size --- must be power of 2 */ int sshift; /* segment shift = log2(ssize) */ int nelem_alloc; /* number of entries to allocate at once */ #ifdef HASH_STATISTICS /* * Count statistics here. NB: stats code doesn't bother with mutex, so * counts could be corrupted a bit in a partitioned table. */ long accesses; long collisions; #endif }; #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0) #define FREELIST_IDX(hctl, hashcode) \ (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0) /* * Top control structure for a hashtable --- in a shared table, each backend * has its own copy (OK since no fields change at runtime) */ struct HTAB { HASHHDR *hctl; /* => shared control information */ HASHSEGMENT *dir; /* directory of segment starts */ HashValueFunc hash; /* hash function */ HashCompareFunc match; /* key comparison function */ HashCopyFunc keycopy; /* key copying function */ HashAllocFunc alloc; /* memory allocator */ MemoryContext hcxt; /* memory context if default allocator used */ char *tabname; /* table name (for error messages) */ bool isshared; /* true if table is in shared memory */ bool isfixed; /* if true, don't enlarge */ /* freezing a shared table isn't allowed, so we can keep state here */ bool frozen; /* true = no more inserts allowed */ /* We keep local copies of these fixed values to reduce contention */ Size keysize; /* hash key length in bytes */ long ssize; /* segment size --- must be power of 2 */ int sshift; /* segment shift = log2(ssize) */ }; /* * Key (also entry) part of a HASHELEMENT */ #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT))) /* * Obtain element pointer given pointer to key */ #define ELEMENT_FROM_KEY(key) \ ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT)))) /* * Fast MOD arithmetic, assuming that y is a power of 2 ! 
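 *
 * For illustration, here is (only a sketch of) the addressing pattern the
 * rest of this file builds on top of MOD: a bucket number is split into a
 * directory slot plus an offset within that segment, as done in
 * hash_search_with_hash_value and friends:
 *
 *		segment_num = bucket >> hashp->sshift;
 *		segment_ndx = MOD(bucket, hashp->ssize);
 *		bucketPtr = &hashp->dir[segment_num][segment_ndx];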
*/ #define MOD(x,y) ((x) & ((y)-1)) #ifdef HASH_STATISTICS static long hash_accesses, hash_collisions, hash_expansions; #endif /* * Private function prototypes */ static void *DynaHashAlloc(Size size); static HASHSEGMENT seg_alloc(HTAB *hashp); static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx); static bool dir_realloc(HTAB *hashp); static bool expand_table(HTAB *hashp); static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx); static void hdefault(HTAB *hashp); static int choose_nelem_alloc(Size entrysize); static bool init_htab(HTAB *hashp, long nelem); static void hash_corrupted(HTAB *hashp); static long next_pow2_long(long num); static int next_pow2_int(long num); static void register_seq_scan(HTAB *hashp); static void deregister_seq_scan(HTAB *hashp); static bool has_seq_scans(HTAB *hashp); /* * memory allocation support */ static MemoryContext CurrentDynaHashCxt = NULL; static void * DynaHashAlloc(Size size) { Assert(MemoryContextIsValid(CurrentDynaHashCxt)); return MemoryContextAllocExtended(CurrentDynaHashCxt, size, MCXT_ALLOC_NO_OOM); } /* * HashCompareFunc for string keys * * Because we copy keys with strlcpy(), they will be truncated at keysize-1 * bytes, so we can only compare that many ... hence strncmp is almost but * not quite the right thing. */ static int string_compare(const char *key1, const char *key2, Size keysize) { return strncmp(key1, key2, keysize - 1); } /************************** CREATE ROUTINES **********************/ /* * hash_create -- create a new dynamic hash table * * tabname: a name for the table (for debugging purposes) * nelem: maximum number of elements expected * *info: additional table parameters, as indicated by flags * flags: bitmask indicating which parameters to take from *info * * The flags value *must* include HASH_ELEM. (Formerly, this was nominally * optional, but the default keysize and entrysize values were useless.) * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS, * or HASH_FUNCTION, to define the key hashing semantics (C strings, * binary blobs, or custom, respectively). Callers specifying a custom * hash function will likely also want to use HASH_COMPARE, and perhaps * also HASH_KEYCOPY, to control key comparison and copying. * Another often-used flag is HASH_CONTEXT, to allocate the hash table * under info->hcxt rather than under TopMemoryContext; the default * behavior is only suitable for session-lifespan hash tables. * Other flags bits are special-purpose and seldom used, except for those * associated with shared-memory hash tables, for which see ShmemInitHash(). * * Fields in *info are read only when the associated flags bit is set. * It is not necessary to initialize other fields of *info. * Neither tabname nor *info need persist after the hash_create() call. * * Note: It is deprecated for callers of hash_create() to explicitly specify * string_hash, tag_hash, uint32_hash, or oid_hash. Just set HASH_STRINGS or * HASH_BLOBS. Use HASH_FUNCTION only when you want something other than * one of these. * * Note: for a shared-memory hashtable, nelem needs to be a pretty good * estimate, since we can't expand the table on the fly. But an unshared * hashtable can be expanded on-the-fly, so it's better for nelem to be * on the small side and let the table grow if it's exceeded. An overly * large nelem will penalize hash_seq_search speed without buying much. 
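 *
 * For orientation, a minimal backend-local usage could look like this (a
 * sketch only; "MyEntry" and the table name are illustrative, and the entry
 * struct must begin with the key field):
 *
 *		HASHCTL		info;
 *		HTAB	   *htab;
 *
 *		info.keysize = sizeof(Oid);
 *		info.entrysize = sizeof(MyEntry);
 *		info.hcxt = CurrentMemoryContext;
 *		htab = hash_create("MyEntry lookup table", 128, &info,
 *						   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);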
*/ HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags) { HTAB *hashp; HASHHDR *hctl; /* * Hash tables now allocate space for key and data, but you have to say * how much space to allocate. */ Assert(flags & HASH_ELEM); Assert(info->keysize > 0); Assert(info->entrysize >= info->keysize); /* * For shared hash tables, we have a local hash header (HTAB struct) that * we allocate in TopMemoryContext; all else is in shared memory. * * For non-shared hash tables, everything including the hash header is in * a memory context created specially for the hash table --- this makes * hash_destroy very simple. The memory context is made a child of either * a context specified by the caller, or TopMemoryContext if nothing is * specified. */ if (flags & HASH_SHARED_MEM) { /* Set up to allocate the hash header */ CurrentDynaHashCxt = TopMemoryContext; } else { /* Create the hash table's private memory context */ if (flags & HASH_CONTEXT) CurrentDynaHashCxt = info->hcxt; else CurrentDynaHashCxt = TopMemoryContext; CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt, "dynahash", ALLOCSET_DEFAULT_SIZES); } /* Initialize the hash header, plus a copy of the table name */ hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1); MemSet(hashp, 0, sizeof(HTAB)); hashp->tabname = (char *) (hashp + 1); strcpy(hashp->tabname, tabname); /* If we have a private context, label it with hashtable's name */ if (!(flags & HASH_SHARED_MEM)) MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname); /* * Select the appropriate hash function (see comments at head of file). */ if (flags & HASH_FUNCTION) { Assert(!(flags & (HASH_BLOBS | HASH_STRINGS))); hashp->hash = info->hash; } else if (flags & HASH_BLOBS) { Assert(!(flags & HASH_STRINGS)); /* We can optimize hashing for common key sizes */ if (info->keysize == sizeof(uint32)) hashp->hash = uint32_hash; else hashp->hash = tag_hash; } else { /* * string_hash used to be considered the default hash method, and in a * non-assert build it effectively still is. But we now consider it * an assertion error to not say HASH_STRINGS explicitly. To help * catch mistaken usage of HASH_STRINGS, we also insist on a * reasonably long string length: if the keysize is only 4 or 8 bytes, * it's almost certainly an integer or pointer not a string. */ Assert(flags & HASH_STRINGS); Assert(info->keysize > 8); hashp->hash = string_hash; } /* * If you don't specify a match function, it defaults to string_compare if * you used string_hash, and to memcmp otherwise. * * Note: explicitly specifying string_hash is deprecated, because this * might not work for callers in loadable modules on some platforms due to * referencing a trampoline instead of the string_hash function proper. * Specify HASH_STRINGS instead. */ if (flags & HASH_COMPARE) hashp->match = info->match; else if (hashp->hash == string_hash) hashp->match = (HashCompareFunc) string_compare; else hashp->match = memcmp; /* * Similarly, the key-copying function defaults to strlcpy or memcpy. */ if (flags & HASH_KEYCOPY) hashp->keycopy = info->keycopy; else if (hashp->hash == string_hash) { /* * The signature of keycopy is meant for memcpy(), which returns * void*, but strlcpy() returns size_t. Since we never use the return * value of keycopy, and size_t is pretty much always the same size as * void *, this should be safe. The extra cast in the middle is to * avoid warnings from -Wcast-function-type. 
*/ hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy; } else hashp->keycopy = memcpy; /* And select the entry allocation function, too. */ if (flags & HASH_ALLOC) hashp->alloc = info->alloc; else hashp->alloc = DynaHashAlloc; if (flags & HASH_SHARED_MEM) { /* * ctl structure and directory are preallocated for shared memory * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as * well. */ hashp->hctl = info->hctl; hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR)); hashp->hcxt = NULL; hashp->isshared = true; /* hash table already exists, we're just attaching to it */ if (flags & HASH_ATTACH) { /* make local copies of some heavily-used values */ hctl = hashp->hctl; hashp->keysize = hctl->keysize; hashp->ssize = hctl->ssize; hashp->sshift = hctl->sshift; return hashp; } } else { /* setup hash table defaults */ hashp->hctl = NULL; hashp->dir = NULL; hashp->hcxt = CurrentDynaHashCxt; hashp->isshared = false; } if (!hashp->hctl) { hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR)); if (!hashp->hctl) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } hashp->frozen = false; hdefault(hashp); hctl = hashp->hctl; if (flags & HASH_PARTITION) { /* Doesn't make sense to partition a local hash table */ Assert(flags & HASH_SHARED_MEM); /* * The number of partitions had better be a power of 2. Also, it must * be less than INT_MAX (see init_htab()), so call the int version of * next_pow2. */ Assert(info->num_partitions == next_pow2_int(info->num_partitions)); hctl->num_partitions = info->num_partitions; } if (flags & HASH_SEGMENT) { hctl->ssize = info->ssize; hctl->sshift = my_log2(info->ssize); /* ssize had better be a power of 2 */ Assert(hctl->ssize == (1L << hctl->sshift)); } /* * SHM hash tables have fixed directory size passed by the caller. */ if (flags & HASH_DIRSIZE) { hctl->max_dsize = info->max_dsize; hctl->dsize = info->dsize; } /* remember the entry sizes, too */ hctl->keysize = info->keysize; hctl->entrysize = info->entrysize; /* make local copies of heavily-used constant fields */ hashp->keysize = hctl->keysize; hashp->ssize = hctl->ssize; hashp->sshift = hctl->sshift; /* Build the hash directory structure */ if (!init_htab(hashp, nelem)) elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname); /* * For a shared hash table, preallocate the requested number of elements. * This reduces problems with run-time out-of-shared-memory conditions. * * For a non-shared hash table, preallocate the requested number of * elements if it's less than our chosen nelem_alloc. This avoids wasting * space if the caller correctly estimates a small table size. */ if ((flags & HASH_SHARED_MEM) || nelem < hctl->nelem_alloc) { int i, freelist_partitions, nelem_alloc, nelem_alloc_first; /* * If hash table is partitioned, give each freelist an equal share of * the initial allocation. Otherwise only freeList[0] is used. */ if (IS_PARTITIONED(hashp->hctl)) freelist_partitions = NUM_FREELISTS; else freelist_partitions = 1; nelem_alloc = nelem / freelist_partitions; if (nelem_alloc <= 0) nelem_alloc = 1; /* * Make sure we'll allocate all the requested elements; freeList[0] * gets the excess if the request isn't divisible by NUM_FREELISTS. */ if (nelem_alloc * freelist_partitions < nelem) nelem_alloc_first = nelem - nelem_alloc * (freelist_partitions - 1); else nelem_alloc_first = nelem_alloc; for (i = 0; i < freelist_partitions; i++) { int temp = (i == 0) ? 
nelem_alloc_first : nelem_alloc; if (!element_alloc(hashp, temp, i)) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } } if (flags & HASH_FIXED_SIZE) hashp->isfixed = true; return hashp; } /* * Set default HASHHDR parameters. */ static void hdefault(HTAB *hashp) { HASHHDR *hctl = hashp->hctl; MemSet(hctl, 0, sizeof(HASHHDR)); hctl->dsize = DEF_DIRSIZE; hctl->nsegs = 0; hctl->num_partitions = 0; /* not partitioned */ /* table has no fixed maximum size */ hctl->max_dsize = NO_MAX_DSIZE; hctl->ssize = DEF_SEGSIZE; hctl->sshift = DEF_SEGSIZE_SHIFT; #ifdef HASH_STATISTICS hctl->accesses = hctl->collisions = 0; #endif } /* * Given the user-specified entry size, choose nelem_alloc, ie, how many * elements to add to the hash table when we need more. */ static int choose_nelem_alloc(Size entrysize) { int nelem_alloc; Size elementSize; Size allocSize; /* Each element has a HASHELEMENT header plus user data. */ /* NB: this had better match element_alloc() */ elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize); /* * The idea here is to choose nelem_alloc at least 32, but round up so * that the allocation request will be a power of 2 or just less. This * makes little difference for hash tables in shared memory, but for hash * tables managed by palloc, the allocation request will be rounded up to * a power of 2 anyway. If we fail to take this into account, we'll waste * as much as half the allocated space. */ allocSize = 32 * 4; /* assume elementSize at least 8 */ do { allocSize <<= 1; nelem_alloc = allocSize / elementSize; } while (nelem_alloc < 32); return nelem_alloc; } /* * Compute derived fields of hctl and build the initial directory/segment * arrays */ static bool init_htab(HTAB *hashp, long nelem) { HASHHDR *hctl = hashp->hctl; HASHSEGMENT *segp; int nbuckets; int nsegs; int i; /* * initialize mutexes if it's a partitioned table */ if (IS_PARTITIONED(hctl)) for (i = 0; i < NUM_FREELISTS; i++) SpinLockInit(&(hctl->freeList[i].mutex)); /* * Allocate space for the next greater power of two number of buckets, * assuming a desired maximum load factor of 1. */ nbuckets = next_pow2_int(nelem); /* * In a partitioned table, nbuckets must be at least equal to * num_partitions; were it less, keys with apparently different partition * numbers would map to the same bucket, breaking partition independence. * (Normally nbuckets will be much bigger; this is just a safety check.) */ while (nbuckets < hctl->num_partitions) nbuckets <<= 1; hctl->max_bucket = hctl->low_mask = nbuckets - 1; hctl->high_mask = (nbuckets << 1) - 1; /* * Figure number of directory segments needed, round up to a power of 2 */ nsegs = (nbuckets - 1) / hctl->ssize + 1; nsegs = next_pow2_int(nsegs); /* * Make sure directory is big enough. If pre-allocated directory is too * small, choke (caller screwed up). 
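 *
 * Worked example with the default ssize of 256: nelem = 1000 gives
 * nbuckets = 1024, hence max_bucket = low_mask = 1023 and high_mask = 2047,
 * and nsegs = next_pow2_int((1024 - 1) / 256 + 1) = 4, comfortably below the
 * default directory size of 256.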
*/ if (nsegs > hctl->dsize) { if (!(hashp->dir)) hctl->dsize = nsegs; else return false; } /* Allocate a directory */ if (!(hashp->dir)) { CurrentDynaHashCxt = hashp->hcxt; hashp->dir = (HASHSEGMENT *) hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT)); if (!hashp->dir) return false; } /* Allocate initial segments */ for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++) { *segp = seg_alloc(hashp); if (*segp == NULL) return false; } /* Choose number of entries to allocate at a time */ hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize); #ifdef HASH_DEBUG fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n", "TABLE POINTER ", hashp, "DIRECTORY SIZE ", hctl->dsize, "SEGMENT SIZE ", hctl->ssize, "SEGMENT SHIFT ", hctl->sshift, "MAX BUCKET ", hctl->max_bucket, "HIGH MASK ", hctl->high_mask, "LOW MASK ", hctl->low_mask, "NSEGS ", hctl->nsegs); #endif return true; } /* * Estimate the space needed for a hashtable containing the given number * of entries of given size. * NOTE: this is used to estimate the footprint of hashtables in shared * memory; therefore it does not count HTAB which is in local memory. * NB: assumes that all hash structure parameters have default values! */ Size hash_estimate_size(long num_entries, Size entrysize) { Size size; long nBuckets, nSegments, nDirEntries, nElementAllocs, elementSize, elementAllocCnt; /* estimate number of buckets wanted */ nBuckets = next_pow2_long(num_entries); /* # of segments needed for nBuckets */ nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1); /* directory entries */ nDirEntries = DEF_DIRSIZE; while (nDirEntries < nSegments) nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */ /* fixed control info */ size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */ /* directory */ size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT))); /* segments */ size = add_size(size, mul_size(nSegments, MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET)))); /* elements --- allocated in groups of choose_nelem_alloc() entries */ elementAllocCnt = choose_nelem_alloc(entrysize); nElementAllocs = (num_entries - 1) / elementAllocCnt + 1; elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize); size = add_size(size, mul_size(nElementAllocs, mul_size(elementAllocCnt, elementSize))); return size; } /* * Select an appropriate directory size for a hashtable with the given * maximum number of entries. * This is only needed for hashtables in shared memory, whose directories * cannot be expanded dynamically. * NB: assumes that all hash structure parameters have default values! * * XXX this had better agree with the behavior of init_htab()... */ long hash_select_dirsize(long num_entries) { long nBuckets, nSegments, nDirEntries; /* estimate number of buckets wanted */ nBuckets = next_pow2_long(num_entries); /* # of segments needed for nBuckets */ nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1); /* directory entries */ nDirEntries = DEF_DIRSIZE; while (nDirEntries < nSegments) nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */ return nDirEntries; } /* * Compute the required initial memory allocation for a shared-memory * hashtable with the given parameters. We need space for the HASHHDR * and for the (non expansible) directory. 
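 *
 * For reference, a shared-table caller typically sizes and creates its table
 * roughly like this (a sketch; the entry/key names and partition count are
 * illustrative, and ShmemInitHash fills in the directory-size fields itself):
 *
 *		size = add_size(size, hash_estimate_size(max_entries,
 *												 sizeof(MySharedEntry)));
 *		...request that much shared memory...
 *
 *		info.keysize = sizeof(MySharedKey);
 *		info.entrysize = sizeof(MySharedEntry);
 *		info.num_partitions = NUM_MY_PARTITIONS;
 *		htab = ShmemInitHash("My shared table", init_entries, max_entries,
 *							 &info, HASH_ELEM | HASH_BLOBS | HASH_PARTITION);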
*/ Size hash_get_shared_size(HASHCTL *info, int flags) { Assert(flags & HASH_DIRSIZE); Assert(info->dsize == info->max_dsize); return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT); } /********************** DESTROY ROUTINES ************************/ void hash_destroy(HTAB *hashp) { if (hashp != NULL) { /* allocation method must be one we know how to free, too */ Assert(hashp->alloc == DynaHashAlloc); /* so this hashtable must have its own context */ Assert(hashp->hcxt != NULL); hash_stats("destroy", hashp); /* * Free everything by destroying the hash table's memory context. */ MemoryContextDelete(hashp->hcxt); } } void hash_stats(const char *where, HTAB *hashp) { #ifdef HASH_STATISTICS fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n", where, hashp->hctl->accesses, hashp->hctl->collisions); fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n", hash_get_num_entries(hashp), (long) hashp->hctl->keysize, hashp->hctl->max_bucket, hashp->hctl->nsegs); fprintf(stderr, "%s: total accesses %ld total collisions %ld\n", where, hash_accesses, hash_collisions); fprintf(stderr, "hash_stats: total expansions %ld\n", hash_expansions); #endif } /*******************************SEARCH ROUTINES *****************************/ /* * get_hash_value -- exported routine to calculate a key's hash value * * We export this because for partitioned tables, callers need to compute * the partition number (from the low-order bits of the hash value) before * searching. */ uint32 get_hash_value(HTAB *hashp, const void *keyPtr) { return hashp->hash(keyPtr, hashp->keysize); } /* Convert a hash value to a bucket number */ static inline uint32 calc_bucket(HASHHDR *hctl, uint32 hash_val) { uint32 bucket; bucket = hash_val & hctl->high_mask; if (bucket > hctl->max_bucket) bucket = bucket & hctl->low_mask; return bucket; } /* * hash_search -- look up key in table and perform action * hash_search_with_hash_value -- same, with key's hash value already computed * * action is one of: * HASH_FIND: look up key in table * HASH_ENTER: look up key in table, creating entry if not present * HASH_ENTER_NULL: same, but return NULL if out of memory * HASH_REMOVE: look up key in table, remove entry if present * * Return value is a pointer to the element found/entered/removed if any, * or NULL if no match was found. (NB: in the case of the REMOVE action, * the result is a dangling pointer that shouldn't be dereferenced!) * * HASH_ENTER will normally ereport a generic "out of memory" error if * it is unable to create a new entry. The HASH_ENTER_NULL operation is * the same except it will return NULL if out of memory. * * If foundPtr isn't NULL, then *foundPtr is set true if we found an * existing entry in the table, false otherwise. This is needed in the * HASH_ENTER case, but is redundant with the return value otherwise. * * For hash_search_with_hash_value, the hashvalue parameter must have been * calculated with get_hash_value(). 
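 *
 * Typical insert-or-find usage (a sketch; "MyEntry" is illustrative):
 *
 *		bool	found;
 *		MyEntry *entry;
 *
 *		entry = (MyEntry *) hash_search(htab, &key, HASH_ENTER, &found);
 *		if (!found)
 *			... initialize the non-key fields of *entry ...
 *
 * For a partitioned shared table, compute the hash first so the right
 * partition lock can be taken (again a sketch; the partition-to-LWLock
 * mapping helper is hypothetical and belongs to the caller):
 *
 *		hashcode = get_hash_value(htab, &key);
 *		partitionLock = MyPartitionLock(hashcode);
 *		LWLockAcquire(partitionLock, LW_EXCLUSIVE);
 *		entry = (MyEntry *) hash_search_with_hash_value(htab, &key, hashcode,
 *														HASH_ENTER, &found);
 *		...
 *		LWLockRelease(partitionLock);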
*/ void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr) { return hash_search_with_hash_value(hashp, keyPtr, hashp->hash(keyPtr, hashp->keysize), action, foundPtr); } void * hash_search_with_hash_value(HTAB *hashp, const void *keyPtr, uint32 hashvalue, HASHACTION action, bool *foundPtr) { HASHHDR *hctl = hashp->hctl; int freelist_idx = FREELIST_IDX(hctl, hashvalue); Size keysize; uint32 bucket; long segment_num; long segment_ndx; HASHSEGMENT segp; HASHBUCKET currBucket; HASHBUCKET *prevBucketPtr; HashCompareFunc match; #ifdef HASH_STATISTICS hash_accesses++; hctl->accesses++; #endif /* * If inserting, check if it is time to split a bucket. * * NOTE: failure to expand table is not a fatal error, it just means we * have to run at higher fill factor than we wanted. However, if we're * using the palloc allocator then it will throw error anyway on * out-of-memory, so we must do this before modifying the table. */ if (action == HASH_ENTER || action == HASH_ENTER_NULL) { /* * Can't split if running in partitioned mode, nor if frozen, nor if * table is the subject of any active hash_seq_search scans. */ if (hctl->freeList[0].nentries > (long) hctl->max_bucket && !IS_PARTITIONED(hctl) && !hashp->frozen && !has_seq_scans(hashp)) (void) expand_table(hashp); } /* * Do the initial lookup */ bucket = calc_bucket(hctl, hashvalue); segment_num = bucket >> hashp->sshift; segment_ndx = MOD(bucket, hashp->ssize); segp = hashp->dir[segment_num]; if (segp == NULL) hash_corrupted(hashp); prevBucketPtr = &segp[segment_ndx]; currBucket = *prevBucketPtr; /* * Follow collision chain looking for matching key */ match = hashp->match; /* save one fetch in inner loop */ keysize = hashp->keysize; /* ditto */ while (currBucket != NULL) { if (currBucket->hashvalue == hashvalue && match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0) break; prevBucketPtr = &(currBucket->link); currBucket = *prevBucketPtr; #ifdef HASH_STATISTICS hash_collisions++; hctl->collisions++; #endif } if (foundPtr) *foundPtr = (bool) (currBucket != NULL); /* * OK, now what? */ switch (action) { case HASH_FIND: if (currBucket != NULL) return (void *) ELEMENTKEY(currBucket); return NULL; case HASH_REMOVE: if (currBucket != NULL) { /* if partitioned, must lock to touch nentries and freeList */ if (IS_PARTITIONED(hctl)) SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex)); /* delete the record from the appropriate nentries counter. */ Assert(hctl->freeList[freelist_idx].nentries > 0); hctl->freeList[freelist_idx].nentries--; /* remove record from hash bucket's chain. */ *prevBucketPtr = currBucket->link; /* add the record to the appropriate freelist. 
*/ currBucket->link = hctl->freeList[freelist_idx].freeList; hctl->freeList[freelist_idx].freeList = currBucket; if (IS_PARTITIONED(hctl)) SpinLockRelease(&hctl->freeList[freelist_idx].mutex); /* * better hope the caller is synchronizing access to this * element, because someone else is going to reuse it the next * time something is added to the table */ return (void *) ELEMENTKEY(currBucket); } return NULL; case HASH_ENTER: case HASH_ENTER_NULL: /* Return existing element if found, else create one */ if (currBucket != NULL) return (void *) ELEMENTKEY(currBucket); /* disallow inserts if frozen */ if (hashp->frozen) elog(ERROR, "cannot insert into frozen hashtable \"%s\"", hashp->tabname); currBucket = get_hash_entry(hashp, freelist_idx); if (currBucket == NULL) { /* out of memory */ if (action == HASH_ENTER_NULL) return NULL; /* report a generic message */ if (hashp->isshared) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of shared memory"))); else ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } /* link into hashbucket chain */ *prevBucketPtr = currBucket; currBucket->link = NULL; /* copy key into record */ currBucket->hashvalue = hashvalue; hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize); /* * Caller is expected to fill the data field on return. DO NOT * insert any code that could possibly throw error here, as doing * so would leave the table entry incomplete and hence corrupt the * caller's data structure. */ return (void *) ELEMENTKEY(currBucket); } elog(ERROR, "unrecognized hash action code: %d", (int) action); return NULL; /* keep compiler quiet */ } /* * hash_update_hash_key -- change the hash key of an existing table entry * * This is equivalent to removing the entry, making a new entry, and copying * over its data, except that the entry never goes to the table's freelist. * Therefore this cannot suffer an out-of-memory failure, even if there are * other processes operating in other partitions of the hashtable. * * Returns true if successful, false if the requested new hash key is already * present. Throws error if the specified entry pointer isn't actually a * table member. * * NB: currently, there is no special case for old and new hash keys being * identical, which means we'll report false for that situation. This is * preferable for existing uses. * * NB: for a partitioned hashtable, caller must hold lock on both relevant * partitions, if the new hash key would belong to a different partition. */ bool hash_update_hash_key(HTAB *hashp, void *existingEntry, const void *newKeyPtr) { HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry); HASHHDR *hctl = hashp->hctl; uint32 newhashvalue; Size keysize; uint32 bucket; uint32 newbucket; long segment_num; long segment_ndx; HASHSEGMENT segp; HASHBUCKET currBucket; HASHBUCKET *prevBucketPtr; HASHBUCKET *oldPrevPtr; HashCompareFunc match; #ifdef HASH_STATISTICS hash_accesses++; hctl->accesses++; #endif /* disallow updates if frozen */ if (hashp->frozen) elog(ERROR, "cannot update in frozen hashtable \"%s\"", hashp->tabname); /* * Lookup the existing element using its saved hash value. We need to do * this to be able to unlink it from its hash chain, but as a side benefit * we can verify the validity of the passed existingEntry pointer. 
*/ bucket = calc_bucket(hctl, existingElement->hashvalue); segment_num = bucket >> hashp->sshift; segment_ndx = MOD(bucket, hashp->ssize); segp = hashp->dir[segment_num]; if (segp == NULL) hash_corrupted(hashp); prevBucketPtr = &segp[segment_ndx]; currBucket = *prevBucketPtr; while (currBucket != NULL) { if (currBucket == existingElement) break; prevBucketPtr = &(currBucket->link); currBucket = *prevBucketPtr; } if (currBucket == NULL) elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"", hashp->tabname); oldPrevPtr = prevBucketPtr; /* * Now perform the equivalent of a HASH_ENTER operation to locate the hash * chain we want to put the entry into. */ newhashvalue = hashp->hash(newKeyPtr, hashp->keysize); newbucket = calc_bucket(hctl, newhashvalue); segment_num = newbucket >> hashp->sshift; segment_ndx = MOD(newbucket, hashp->ssize); segp = hashp->dir[segment_num]; if (segp == NULL) hash_corrupted(hashp); prevBucketPtr = &segp[segment_ndx]; currBucket = *prevBucketPtr; /* * Follow collision chain looking for matching key */ match = hashp->match; /* save one fetch in inner loop */ keysize = hashp->keysize; /* ditto */ while (currBucket != NULL) { if (currBucket->hashvalue == newhashvalue && match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0) break; prevBucketPtr = &(currBucket->link); currBucket = *prevBucketPtr; #ifdef HASH_STATISTICS hash_collisions++; hctl->collisions++; #endif } if (currBucket != NULL) return false; /* collision with an existing entry */ currBucket = existingElement; /* * If old and new hash values belong to the same bucket, we need not * change any chain links, and indeed should not since this simplistic * update will corrupt the list if currBucket is the last element. (We * cannot fall out earlier, however, since we need to scan the bucket to * check for duplicate keys.) */ if (bucket != newbucket) { /* OK to remove record from old hash bucket's chain. */ *oldPrevPtr = currBucket->link; /* link into new hashbucket chain */ *prevBucketPtr = currBucket; currBucket->link = NULL; } /* copy new key into record */ currBucket->hashvalue = newhashvalue; hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize); /* rest of record is untouched */ return true; } /* * Allocate a new hashtable entry if possible; return NULL if out of memory. * (Or, if the underlying space allocator throws error for out-of-memory, * we won't return at all.) */ static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx) { HASHHDR *hctl = hashp->hctl; HASHBUCKET newElement; for (;;) { /* if partitioned, must lock to touch nentries and freeList */ if (IS_PARTITIONED(hctl)) SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); /* try to get an entry from the freelist */ newElement = hctl->freeList[freelist_idx].freeList; if (newElement != NULL) break; if (IS_PARTITIONED(hctl)) SpinLockRelease(&hctl->freeList[freelist_idx].mutex); /* * No free elements in this freelist. In a partitioned table, there * might be entries in other freelists, but to reduce contention we * prefer to first try to get another chunk of buckets from the main * shmem allocator. If that fails, though, we *MUST* root through all * the other freelists before giving up. There are multiple callers * that assume that they can allocate every element in the initially * requested table size, or that deleting an element guarantees they * can insert a new element, even if shared memory is entirely full. * Failing because the needed element is in a different freelist is * not acceptable. 
*/ if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx)) { int borrow_from_idx; if (!IS_PARTITIONED(hctl)) return NULL; /* out of memory */ /* try to borrow element from another freelist */ borrow_from_idx = freelist_idx; for (;;) { borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS; if (borrow_from_idx == freelist_idx) break; /* examined all freelists, fail */ SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex)); newElement = hctl->freeList[borrow_from_idx].freeList; if (newElement != NULL) { hctl->freeList[borrow_from_idx].freeList = newElement->link; SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex)); /* careful: count the new element in its proper freelist */ SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); hctl->freeList[freelist_idx].nentries++; SpinLockRelease(&hctl->freeList[freelist_idx].mutex); return newElement; } SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex)); } /* no elements available to borrow either, so out of memory */ return NULL; } } /* remove entry from freelist, bump nentries */ hctl->freeList[freelist_idx].freeList = newElement->link; hctl->freeList[freelist_idx].nentries++; if (IS_PARTITIONED(hctl)) SpinLockRelease(&hctl->freeList[freelist_idx].mutex); return newElement; } /* * hash_get_num_entries -- get the number of entries in a hashtable */ long hash_get_num_entries(HTAB *hashp) { int i; long sum = hashp->hctl->freeList[0].nentries; /* * We currently don't bother with acquiring the mutexes; it's only * sensible to call this function if you've got lock on all partitions of * the table. */ if (IS_PARTITIONED(hashp->hctl)) { for (i = 1; i < NUM_FREELISTS; i++) sum += hashp->hctl->freeList[i].nentries; } return sum; } /* * hash_seq_init/_search/_term * Sequentially search through hash table and return * all the elements one by one, return NULL when no more. * * hash_seq_term should be called if and only if the scan is abandoned before * completion; if hash_seq_search returns NULL then it has already done the * end-of-scan cleanup. * * NOTE: caller may delete the returned element before continuing the scan. * However, deleting any other element while the scan is in progress is * UNDEFINED (it might be the one that curIndex is pointing at!). Also, * if elements are added to the table while the scan is in progress, it is * unspecified whether they will be visited by the scan or not. * * NOTE: it is possible to use hash_seq_init/hash_seq_search without any * worry about hash_seq_term cleanup, if the hashtable is first locked against * further insertions by calling hash_freeze. * * NOTE: to use this with a partitioned hashtable, caller had better hold * at least shared lock on all partitions of the table throughout the scan! * We can cope with insertions or deletions by our own backend, but *not* * with concurrent insertions or deletions by another. */ void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp) { status->hashp = hashp; status->curBucket = 0; status->curEntry = NULL; if (!hashp->frozen) register_seq_scan(hashp); } void * hash_seq_search(HASH_SEQ_STATUS *status) { HTAB *hashp; HASHHDR *hctl; uint32 max_bucket; long ssize; long segment_num; long segment_ndx; HASHSEGMENT segp; uint32 curBucket; HASHELEMENT *curElem; if ((curElem = status->curEntry) != NULL) { /* Continuing scan of curBucket... */ status->curEntry = curElem->link; if (status->curEntry == NULL) /* end of this bucket */ ++status->curBucket; return (void *) ELEMENTKEY(curElem); } /* * Search for next nonempty bucket starting at curBucket. 
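 *
 * (For reference, the caller-side pattern this loop supports is simply:
 *
 *		hash_seq_init(&status, htab);
 *		while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
 *			...process entry...
 *
 * with hash_seq_term(&status) needed only if that loop is abandoned early;
 * "MyEntry" is illustrative.)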
*/ curBucket = status->curBucket; hashp = status->hashp; hctl = hashp->hctl; ssize = hashp->ssize; max_bucket = hctl->max_bucket; if (curBucket > max_bucket) { hash_seq_term(status); return NULL; /* search is done */ } /* * first find the right segment in the table directory. */ segment_num = curBucket >> hashp->sshift; segment_ndx = MOD(curBucket, ssize); segp = hashp->dir[segment_num]; /* * Pick up the first item in this bucket's chain. If chain is not empty * we can begin searching it. Otherwise we have to advance to find the * next nonempty bucket. We try to optimize that case since searching a * near-empty hashtable has to iterate this loop a lot. */ while ((curElem = segp[segment_ndx]) == NULL) { /* empty bucket, advance to next */ if (++curBucket > max_bucket) { status->curBucket = curBucket; hash_seq_term(status); return NULL; /* search is done */ } if (++segment_ndx >= ssize) { segment_num++; segment_ndx = 0; segp = hashp->dir[segment_num]; } } /* Begin scan of curBucket... */ status->curEntry = curElem->link; if (status->curEntry == NULL) /* end of this bucket */ ++curBucket; status->curBucket = curBucket; return (void *) ELEMENTKEY(curElem); } void hash_seq_term(HASH_SEQ_STATUS *status) { if (!status->hashp->frozen) deregister_seq_scan(status->hashp); } /* * hash_freeze * Freeze a hashtable against future insertions (deletions are * still allowed) * * The reason for doing this is that by preventing any more bucket splits, * we no longer need to worry about registering hash_seq_search scans, * and thus caller need not be careful about ensuring hash_seq_term gets * called at the right times. * * Multiple calls to hash_freeze() are allowed, but you can't freeze a table * with active scans (since hash_seq_term would then do the wrong thing). */ void hash_freeze(HTAB *hashp) { if (hashp->isshared) elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname); if (!hashp->frozen && has_seq_scans(hashp)) elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans", hashp->tabname); hashp->frozen = true; } /********************************* UTILITIES ************************/ /* * Expand the table by adding one more hash bucket. */ static bool expand_table(HTAB *hashp) { HASHHDR *hctl = hashp->hctl; HASHSEGMENT old_seg, new_seg; long old_bucket, new_bucket; long new_segnum, new_segndx; long old_segnum, old_segndx; HASHBUCKET *oldlink, *newlink; HASHBUCKET currElement, nextElement; Assert(!IS_PARTITIONED(hctl)); #ifdef HASH_STATISTICS hash_expansions++; #endif new_bucket = hctl->max_bucket + 1; new_segnum = new_bucket >> hashp->sshift; new_segndx = MOD(new_bucket, hashp->ssize); if (new_segnum >= hctl->nsegs) { /* Allocate new segment if necessary -- could fail if dir full */ if (new_segnum >= hctl->dsize) if (!dir_realloc(hashp)) return false; if (!(hashp->dir[new_segnum] = seg_alloc(hashp))) return false; hctl->nsegs++; } /* OK, we created a new bucket */ hctl->max_bucket++; /* * *Before* changing masks, find old bucket corresponding to same hash * values; values in that bucket may need to be relocated to new bucket. * Note that new_bucket is certainly larger than low_mask at this point, * so we can skip the first step of the regular hash mask calc. */ old_bucket = (new_bucket & hctl->low_mask); /* * If we crossed a power of 2, readjust masks. */ if ((uint32) new_bucket > hctl->high_mask) { hctl->low_mask = hctl->high_mask; hctl->high_mask = (uint32) new_bucket | hctl->low_mask; } /* * Relocate records to the new bucket. 
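 * (Illustration: with low_mask = 1023 and high_mask = 2047, creating bucket
 * 1024 splits old bucket 1024 & 1023 = 0; the masks themselves advance only
 * once a new bucket number exceeds high_mask, e.g. at bucket 2048.)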
NOTE: because of the way the hash * masking is done in calc_bucket, only one old bucket can need to be * split at this point. With a different way of reducing the hash value, * that might not be true! */ old_segnum = old_bucket >> hashp->sshift; old_segndx = MOD(old_bucket, hashp->ssize); old_seg = hashp->dir[old_segnum]; new_seg = hashp->dir[new_segnum]; oldlink = &old_seg[old_segndx]; newlink = &new_seg[new_segndx]; for (currElement = *oldlink; currElement != NULL; currElement = nextElement) { nextElement = currElement->link; if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket) { *oldlink = currElement; oldlink = &currElement->link; } else { *newlink = currElement; newlink = &currElement->link; } } /* don't forget to terminate the rebuilt hash chains... */ *oldlink = NULL; *newlink = NULL; return true; } static bool dir_realloc(HTAB *hashp) { HASHSEGMENT *p; HASHSEGMENT *old_p; long new_dsize; long old_dirsize; long new_dirsize; if (hashp->hctl->max_dsize != NO_MAX_DSIZE) return false; /* Reallocate directory */ new_dsize = hashp->hctl->dsize << 1; old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT); new_dirsize = new_dsize * sizeof(HASHSEGMENT); old_p = hashp->dir; CurrentDynaHashCxt = hashp->hcxt; p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize); if (p != NULL) { memcpy(p, old_p, old_dirsize); MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize); hashp->dir = p; hashp->hctl->dsize = new_dsize; /* XXX assume the allocator is palloc, so we know how to free */ Assert(hashp->alloc == DynaHashAlloc); pfree(old_p); return true; } return false; } static HASHSEGMENT seg_alloc(HTAB *hashp) { HASHSEGMENT segp; CurrentDynaHashCxt = hashp->hcxt; segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize); if (!segp) return NULL; MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize); return segp; } /* * allocate some new elements and link them into the indicated free list */ static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx) { HASHHDR *hctl = hashp->hctl; Size elementSize; HASHELEMENT *firstElement; HASHELEMENT *tmpElement; HASHELEMENT *prevElement; int i; if (hashp->isfixed) return false; /* Each element has a HASHELEMENT header plus user data. */ elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize); CurrentDynaHashCxt = hashp->hcxt; firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize); if (!firstElement) return false; /* prepare to link all the new entries into the freelist */ prevElement = NULL; tmpElement = firstElement; for (i = 0; i < nelem; i++) { tmpElement->link = prevElement; prevElement = tmpElement; tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize); } /* if partitioned, must lock to touch freeList */ if (IS_PARTITIONED(hctl)) SpinLockAcquire(&hctl->freeList[freelist_idx].mutex); /* freelist could be nonempty if two backends did this concurrently */ firstElement->link = hctl->freeList[freelist_idx].freeList; hctl->freeList[freelist_idx].freeList = prevElement; if (IS_PARTITIONED(hctl)) SpinLockRelease(&hctl->freeList[freelist_idx].mutex); return true; } /* complain when we have detected a corrupted hashtable */ static void hash_corrupted(HTAB *hashp) { /* * If the corruption is in a shared hashtable, we'd better force a * systemwide restart. Otherwise, just shut down this one backend. 
*/ if (hashp->isshared) elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname); else elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname); } /* calculate ceil(log base 2) of num */ int my_log2(long num) { /* * guard against too-large input, which would be invalid for * pg_ceil_log2_*() */ if (num > LONG_MAX / 2) num = LONG_MAX / 2; #if SIZEOF_LONG < 8 return pg_ceil_log2_32(num); #else return pg_ceil_log2_64(num); #endif } /* calculate first power of 2 >= num, bounded to what will fit in a long */ static long next_pow2_long(long num) { /* my_log2's internal range check is sufficient */ return 1L << my_log2(num); } /* calculate first power of 2 >= num, bounded to what will fit in an int */ static int next_pow2_int(long num) { if (num > INT_MAX / 2) num = INT_MAX / 2; return 1 << my_log2(num); } /************************* SEQ SCAN TRACKING ************************/ /* * We track active hash_seq_search scans here. The need for this mechanism * comes from the fact that a scan will get confused if a bucket split occurs * while it's in progress: it might visit entries twice, or even miss some * entirely (if it's partway through the same bucket that splits). Hence * we want to inhibit bucket splits if there are any active scans on the * table being inserted into. This is a fairly rare case in current usage, * so just postponing the split until the next insertion seems sufficient. * * Given present usages of the function, only a few scans are likely to be * open concurrently; so a finite-size stack of open scans seems sufficient, * and we don't worry that linear search is too slow. Note that we do * allow multiple scans of the same hashtable to be open concurrently. * * This mechanism can support concurrent scan and insertion in a shared * hashtable if it's the same backend doing both. It would fail otherwise, * but locking reasons seem to preclude any such scenario anyway, so we don't * worry. * * This arrangement is reasonably robust if a transient hashtable is deleted * without notifying us. The absolute worst case is we might inhibit splits * in another table created later at exactly the same address. We will give * a warning at transaction end for reference leaks, so any bugs leading to * lack of notification should be easy to catch. 
*/ #define MAX_SEQ_SCANS 100 static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */ static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */ static int num_seq_scans = 0; /* Register a table as having an active hash_seq_search scan */ static void register_seq_scan(HTAB *hashp) { if (num_seq_scans >= MAX_SEQ_SCANS) elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"", hashp->tabname); seq_scan_tables[num_seq_scans] = hashp; seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel(); num_seq_scans++; } /* Deregister an active scan */ static void deregister_seq_scan(HTAB *hashp) { int i; /* Search backward since it's most likely at the stack top */ for (i = num_seq_scans - 1; i >= 0; i--) { if (seq_scan_tables[i] == hashp) { seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; num_seq_scans--; return; } } elog(ERROR, "no hash_seq_search scan for hash table \"%s\"", hashp->tabname); } /* Check if a table has any active scan */ static bool has_seq_scans(HTAB *hashp) { int i; for (i = 0; i < num_seq_scans; i++) { if (seq_scan_tables[i] == hashp) return true; } return false; } /* Clean up any open scans at end of transaction */ void AtEOXact_HashTables(bool isCommit) { /* * During abort cleanup, open scans are expected; just silently clean 'em * out. An open scan at commit means someone forgot a hash_seq_term() * call, so complain. * * Note: it's tempting to try to print the tabname here, but refrain for * fear of touching deallocated memory. This isn't a user-facing message * anyway, so it needn't be pretty. */ if (isCommit) { int i; for (i = 0; i < num_seq_scans; i++) { elog(WARNING, "leaked hash_seq_search scan for hash table %p", seq_scan_tables[i]); } } num_seq_scans = 0; } /* Clean up any open scans at end of subtransaction */ void AtEOSubXact_HashTables(bool isCommit, int nestDepth) { int i; /* * Search backward to make cleanup easy. Note we must check all entries, * not only those at the end of the array, because deletion technique * doesn't keep them in order. */ for (i = num_seq_scans - 1; i >= 0; i--) { if (seq_scan_level[i] >= nestDepth) { if (isCommit) elog(WARNING, "leaked hash_seq_search scan for hash table %p", seq_scan_tables[i]); seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; seq_scan_level[i] = seq_scan_level[num_seq_scans - 1]; num_seq_scans--; } } }