Diffstat (limited to 'src/backend/nodes/tidbitmap.c')
-rw-r--r--    src/backend/nodes/tidbitmap.c    1561
1 file changed, 1561 insertions, 0 deletions
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
new file mode 100644
index 0000000..c5feacb
--- /dev/null
+++ b/src/backend/nodes/tidbitmap.c
@@ -0,0 +1,1561 @@
+/*-------------------------------------------------------------------------
+ *
+ * tidbitmap.c
+ *	  PostgreSQL tuple-id (TID) bitmap package
+ *
+ * This module provides bitmap data structures that are spiritually
+ * similar to Bitmapsets, but are specially adapted to store sets of
+ * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
+ * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
+ * Also, since we wish to be able to store very large tuple sets in
+ * memory with this data structure, we support "lossy" storage, in which
+ * we no longer remember individual tuple offsets on a page but only the
+ * fact that a particular page needs to be visited.
+ *
+ * The "lossy" storage uses one bit per disk page, so at the standard 8K
+ * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
+ * of memory.  People pushing around tables of that size should have a
+ * couple of Mb to spare, so we don't worry about providing a second level
+ * of lossiness.  In theory we could fall back to page ranges at some
+ * point, but for now that seems useless complexity.
+ *
+ * We also support the notion of candidate matches, or rechecking.  This
+ * means we know that a search need visit only some tuples on a page,
+ * but we are not certain that all of those tuples are real matches.
+ * So the eventual heap scan must recheck the quals for these tuples only,
+ * rather than rechecking the quals for all tuples on the page as in the
+ * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
+ * into a bitmap, and it can also happen internally when we AND a lossy
+ * and a non-lossy page.
+ *
+ *
+ * Copyright (c) 2003-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/nodes/tidbitmap.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/htup_details.h"
+#include "common/hashfn.h"
+#include "nodes/bitmapset.h"
+#include "nodes/tidbitmap.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
+
+/*
+ * The maximum number of tuples per page is not large (typically 256 with
+ * 8K pages, or 1024 with 32K pages).  So there's not much point in making
+ * the per-page bitmaps variable size.  We just legislate that the size
+ * is this:
+ */
+#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
+
+/*
+ * When we have to switch over to lossy storage, we use a data structure
+ * with one bit per page, where all pages having the same number DIV
+ * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
+ * and has the bit set for a given page, there must not be a per-page entry
+ * for that page in the page table.
+ *
+ * We actually store both exact pages and lossy chunks in the same hash
+ * table, using identical data structures.  (This is because the memory
+ * management for hashtables doesn't easily/efficiently allow space to be
+ * transferred from one hashtable to another.)  Therefore it's best if
+ * PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
+ * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
+ * avoid expensive integer remainder operations.
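+ *
+ * To make the arithmetic concrete (an illustration, not code from this
+ * file): with the default BLCKSZ of 8192, the definition below gives
+ * PAGES_PER_CHUNK = 256, so
+ *
+ *		int			bitno = pageno % PAGES_PER_CHUNK;	(bit within the chunk)
+ *		BlockNumber hdr = pageno - bitno;				(chunk header block)
+ *
+ * places page 1000 at bit 232 of the chunk headed by block 768.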
So, define it like this: + */ +#define PAGES_PER_CHUNK (BLCKSZ / 32) + +/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */ + +#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD) +#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD) + +/* number of active words for an exact page: */ +#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1) +/* number of active words for a lossy chunk: */ +#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1) + +/* + * The hashtable entries are represented by this data structure. For + * an exact page, blockno is the page number and bit k of the bitmap + * represents tuple offset k+1. For a lossy chunk, blockno is the first + * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and + * bit k represents page blockno+k. Note that it is not possible to + * have exact storage for the first page of a chunk if we are using + * lossy storage for any page in the chunk's range, since the same + * hashtable entry has to serve both purposes. + * + * recheck is used only on exact pages --- it indicates that although + * only the stated tuples need be checked, the full index qual condition + * must be checked for each (ie, these are candidate matches). + */ +typedef struct PagetableEntry +{ + BlockNumber blockno; /* page number (hashtable key) */ + char status; /* hash entry status */ + bool ischunk; /* T = lossy storage, F = exact */ + bool recheck; /* should the tuples be rechecked? */ + bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]; +} PagetableEntry; + +/* + * Holds array of pagetable entries. + */ +typedef struct PTEntryArray +{ + pg_atomic_uint32 refcount; /* no. of iterator attached */ + PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER]; +} PTEntryArray; + +/* + * We want to avoid the overhead of creating the hashtable, which is + * comparatively large, when not necessary. Particularly when we are using a + * bitmap scan on the inside of a nestloop join: a bitmap may well live only + * long enough to accumulate one entry in such cases. We therefore avoid + * creating an actual hashtable until we need two pagetable entries. When + * just one pagetable entry is needed, we store it in a fixed field of + * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later + * shrinks down to zero or one page again. So, status can be TBM_HASH even + * when nentries is zero or one.) + */ +typedef enum +{ + TBM_EMPTY, /* no hashtable, nentries == 0 */ + TBM_ONE_PAGE, /* entry1 contains the single entry */ + TBM_HASH /* pagetable is valid, entry1 is not */ +} TBMStatus; + +/* + * Current iterating state of the TBM. + */ +typedef enum +{ + TBM_NOT_ITERATING, /* not yet converted to page and chunk array */ + TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */ + TBM_ITERATING_SHARED /* converted to shared page and chunk array */ +} TBMIteratingState; + +/* + * Here is the representation for a whole TIDBitMap: + */ +struct TIDBitmap +{ + NodeTag type; /* to make it a valid Node */ + MemoryContext mcxt; /* memory context containing me */ + TBMStatus status; /* see codes above */ + struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */ + int nentries; /* number of entries in pagetable */ + int maxentries; /* limit on same to meet maxbytes */ + int npages; /* number of exact entries in pagetable */ + int nchunks; /* number of lossy entries in pagetable */ + TBMIteratingState iterating; /* tbm_begin_iterate called? 
*/ + uint32 lossify_start; /* offset to start lossifying hashtable at */ + PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ + /* these are valid when iterating is true: */ + PagetableEntry **spages; /* sorted exact-page list, or NULL */ + PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */ + dsa_pointer dsapagetable; /* dsa_pointer to the element array */ + dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */ + dsa_pointer ptpages; /* dsa_pointer to the page array */ + dsa_pointer ptchunks; /* dsa_pointer to the chunk array */ + dsa_area *dsa; /* reference to per-query dsa area */ +}; + +/* + * When iterating over a bitmap in sorted order, a TBMIterator is used to + * track our progress. There can be several iterators scanning the same + * bitmap concurrently. Note that the bitmap becomes read-only as soon as + * any iterator is created. + */ +struct TBMIterator +{ + TIDBitmap *tbm; /* TIDBitmap we're iterating over */ + int spageptr; /* next spages index */ + int schunkptr; /* next schunks index */ + int schunkbit; /* next bit to check in current schunk */ + TBMIterateResult output; /* MUST BE LAST (because variable-size) */ +}; + +/* + * Holds the shared members of the iterator so that multiple processes + * can jointly iterate. + */ +typedef struct TBMSharedIteratorState +{ + int nentries; /* number of entries in pagetable */ + int maxentries; /* limit on same to meet maxbytes */ + int npages; /* number of exact entries in pagetable */ + int nchunks; /* number of lossy entries in pagetable */ + dsa_pointer pagetable; /* dsa pointers to head of pagetable data */ + dsa_pointer spages; /* dsa pointer to page array */ + dsa_pointer schunks; /* dsa pointer to chunk array */ + LWLock lock; /* lock to protect below members */ + int spageptr; /* next spages index */ + int schunkptr; /* next schunks index */ + int schunkbit; /* next bit to check in current schunk */ +} TBMSharedIteratorState; + +/* + * pagetable iteration array. + */ +typedef struct PTIterationArray +{ + pg_atomic_uint32 refcount; /* no. of iterator attached */ + int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */ +} PTIterationArray; + +/* + * same as TBMIterator, but it is used for joint iteration, therefore this + * also holds a reference to the shared state. 
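+ *
+ * The intended call sequence is roughly the following (a sketch only; the
+ * authoritative callers are in the parallel bitmap heap scan code):
+ *
+ *		dsa_pointer dp = tbm_prepare_shared_iterate(tbm);	-- in the leader
+ *		... dp is handed to each worker via shared memory ...
+ *		TBMSharedIterator *it = tbm_attach_shared_iterate(dsa, dp);
+ *		while (tbm_shared_iterate(it) != NULL)
+ *			... process one page per call ...
+ *		tbm_end_shared_iterate(it);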
+ */ +struct TBMSharedIterator +{ + TBMSharedIteratorState *state; /* shared state */ + PTEntryArray *ptbase; /* pagetable element array */ + PTIterationArray *ptpages; /* sorted exact page index list */ + PTIterationArray *ptchunks; /* sorted lossy page index list */ + TBMIterateResult output; /* MUST BE LAST (because variable-size) */ +}; + +/* Local function prototypes */ +static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage); +static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, + const TIDBitmap *b); +static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm, + BlockNumber pageno); +static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno); +static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno); +static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno); +static void tbm_lossify(TIDBitmap *tbm); +static int tbm_comparator(const void *left, const void *right); +static int tbm_shared_comparator(const void *left, const void *right, + void *arg); + +/* define hashtable mapping block numbers to PagetableEntry's */ +#define SH_USE_NONDEFAULT_ALLOCATOR +#define SH_PREFIX pagetable +#define SH_ELEMENT_TYPE PagetableEntry +#define SH_KEY_TYPE BlockNumber +#define SH_KEY blockno +#define SH_HASH_KEY(tb, key) murmurhash32(key) +#define SH_EQUAL(tb, a, b) a == b +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + + +/* + * tbm_create - create an initially-empty bitmap + * + * The bitmap will live in the memory context that is CurrentMemoryContext + * at the time of this call. It will be limited to (approximately) maxbytes + * total memory consumption. If the DSA passed to this function is not NULL + * then the memory for storing elements of the underlying page table will + * be allocated from the DSA. + */ +TIDBitmap * +tbm_create(long maxbytes, dsa_area *dsa) +{ + TIDBitmap *tbm; + + /* Create the TIDBitmap struct and zero all its fields */ + tbm = makeNode(TIDBitmap); + + tbm->mcxt = CurrentMemoryContext; + tbm->status = TBM_EMPTY; + + tbm->maxentries = (int) tbm_calculate_entries(maxbytes); + tbm->lossify_start = 0; + tbm->dsa = dsa; + tbm->dsapagetable = InvalidDsaPointer; + tbm->dsapagetableold = InvalidDsaPointer; + tbm->ptpages = InvalidDsaPointer; + tbm->ptchunks = InvalidDsaPointer; + + return tbm; +} + +/* + * Actually create the hashtable. Since this is a moderately expensive + * proposition, we don't do it until we have to. 
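+ *
+ * To illustrate the laziness (a sketch; tid1 and tid2 stand for
+ * hypothetical TIDs on two different heap pages):
+ *
+ *		tbm = tbm_create(work_mem * 1024L, NULL);	-- status TBM_EMPTY
+ *		tbm_add_tuples(tbm, &tid1, 1, false);		-- TBM_ONE_PAGE, no hashtable
+ *		tbm_add_tuples(tbm, &tid2, 1, false);		-- second page arrives, this
+ *													-- function runs: TBM_HASH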
+ */
+static void
+tbm_create_pagetable(TIDBitmap *tbm)
+{
+	Assert(tbm->status != TBM_HASH);
+	Assert(tbm->pagetable == NULL);
+
+	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
+
+	/* If entry1 is valid, push it into the hashtable */
+	if (tbm->status == TBM_ONE_PAGE)
+	{
+		PagetableEntry *page;
+		bool		found;
+		char		oldstatus;
+
+		page = pagetable_insert(tbm->pagetable,
+								tbm->entry1.blockno,
+								&found);
+		Assert(!found);
+		oldstatus = page->status;
+		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
+		page->status = oldstatus;
+	}
+
+	tbm->status = TBM_HASH;
+}
+
+/*
+ * tbm_free - free a TIDBitmap
+ */
+void
+tbm_free(TIDBitmap *tbm)
+{
+	if (tbm->pagetable)
+		pagetable_destroy(tbm->pagetable);
+	if (tbm->spages)
+		pfree(tbm->spages);
+	if (tbm->schunks)
+		pfree(tbm->schunks);
+	pfree(tbm);
+}
+
+/*
+ * tbm_free_shared_area - free shared state
+ *
+ * Free the shared iterator state.  Also free the shared pagetable and the
+ * iterator arrays, provided they are no longer referenced by any shared
+ * iterator, i.e. their refcount has dropped to 0.
+ */
+void
+tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
+{
+	TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
+	PTEntryArray *ptbase;
+	PTIterationArray *ptpages;
+	PTIterationArray *ptchunks;
+
+	if (DsaPointerIsValid(istate->pagetable))
+	{
+		ptbase = dsa_get_address(dsa, istate->pagetable);
+		if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
+			dsa_free(dsa, istate->pagetable);
+	}
+	if (DsaPointerIsValid(istate->spages))
+	{
+		ptpages = dsa_get_address(dsa, istate->spages);
+		if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
+			dsa_free(dsa, istate->spages);
+	}
+	if (DsaPointerIsValid(istate->schunks))
+	{
+		ptchunks = dsa_get_address(dsa, istate->schunks);
+		if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
+			dsa_free(dsa, istate->schunks);
+	}
+
+	dsa_free(dsa, dp);
+}
+
+/*
+ * tbm_add_tuples - add some tuple IDs to a TIDBitmap
+ *
+ * If recheck is true, then the recheck flag will be set in the
+ * TBMIterateResult when any of these tuples are reported out.
+ */
+void
+tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
+			   bool recheck)
+{
+	BlockNumber currblk = InvalidBlockNumber;
+	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
+	int			i;
+
+	Assert(tbm->iterating == TBM_NOT_ITERATING);
+	for (i = 0; i < ntids; i++)
+	{
+		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
+		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
+		int			wordnum,
+					bitnum;
+
+		/* safety check to ensure we don't overrun bit array bounds */
+		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
+			elog(ERROR, "tuple offset out of range: %u", off);
+
+		/*
+		 * Look up target page unless we already did.  This saves cycles when
+		 * the input includes consecutive tuples on the same page, which is
+		 * common enough to justify an extra test here.
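+		 * (An ordered index scan tends to hand over runs such as (42,1),
+		 * (42,2), (42,5), (43,1) -- hypothetical TIDs -- so only the block
+		 * transitions pay for a page-table probe.)  For the bit arithmetic
+		 * further down, note that offset k of a page is stored as bit k-1:
+		 * TID (42,5) sets bit BITNUM(4) of words[WORDNUM(4)] in the entry
+		 * for block 42.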
+ */ + if (blk != currblk) + { + if (tbm_page_is_lossy(tbm, blk)) + page = NULL; /* remember page is lossy */ + else + page = tbm_get_pageentry(tbm, blk); + currblk = blk; + } + + if (page == NULL) + continue; /* whole page is already marked */ + + if (page->ischunk) + { + /* The page is a lossy chunk header, set bit for itself */ + wordnum = bitnum = 0; + } + else + { + /* Page is exact, so set bit for individual tuple */ + wordnum = WORDNUM(off - 1); + bitnum = BITNUM(off - 1); + } + page->words[wordnum] |= ((bitmapword) 1 << bitnum); + page->recheck |= recheck; + + if (tbm->nentries > tbm->maxentries) + { + tbm_lossify(tbm); + /* Page could have been converted to lossy, so force new lookup */ + currblk = InvalidBlockNumber; + } + } +} + +/* + * tbm_add_page - add a whole page to a TIDBitmap + * + * This causes the whole page to be reported (with the recheck flag) + * when the TIDBitmap is scanned. + */ +void +tbm_add_page(TIDBitmap *tbm, BlockNumber pageno) +{ + /* Enter the page in the bitmap, or mark it lossy if already present */ + tbm_mark_page_lossy(tbm, pageno); + /* If we went over the memory limit, lossify some more pages */ + if (tbm->nentries > tbm->maxentries) + tbm_lossify(tbm); +} + +/* + * tbm_union - set union + * + * a is modified in-place, b is not changed + */ +void +tbm_union(TIDBitmap *a, const TIDBitmap *b) +{ + Assert(!a->iterating); + /* Nothing to do if b is empty */ + if (b->nentries == 0) + return; + /* Scan through chunks and pages in b, merge into a */ + if (b->status == TBM_ONE_PAGE) + tbm_union_page(a, &b->entry1); + else + { + pagetable_iterator i; + PagetableEntry *bpage; + + Assert(b->status == TBM_HASH); + pagetable_start_iterate(b->pagetable, &i); + while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL) + tbm_union_page(a, bpage); + } +} + +/* Process one page of b during a union op */ +static void +tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage) +{ + PagetableEntry *apage; + int wordnum; + + if (bpage->ischunk) + { + /* Scan b's chunk, mark each indicated page lossy in a */ + for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) + { + bitmapword w = bpage->words[wordnum]; + + if (w != 0) + { + BlockNumber pg; + + pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD); + while (w != 0) + { + if (w & 1) + tbm_mark_page_lossy(a, pg); + pg++; + w >>= 1; + } + } + } + } + else if (tbm_page_is_lossy(a, bpage->blockno)) + { + /* page is already lossy in a, nothing to do */ + return; + } + else + { + apage = tbm_get_pageentry(a, bpage->blockno); + if (apage->ischunk) + { + /* The page is a lossy chunk header, set bit for itself */ + apage->words[0] |= ((bitmapword) 1 << 0); + } + else + { + /* Both pages are exact, merge at the bit level */ + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + apage->words[wordnum] |= bpage->words[wordnum]; + apage->recheck |= bpage->recheck; + } + } + + if (a->nentries > a->maxentries) + tbm_lossify(a); +} + +/* + * tbm_intersect - set intersection + * + * a is modified in-place, b is not changed + */ +void +tbm_intersect(TIDBitmap *a, const TIDBitmap *b) +{ + Assert(!a->iterating); + /* Nothing to do if a is empty */ + if (a->nentries == 0) + return; + /* Scan through chunks and pages in a, try to match to b */ + if (a->status == TBM_ONE_PAGE) + { + if (tbm_intersect_page(a, &a->entry1, b)) + { + /* Page is now empty, remove it from a */ + Assert(!a->entry1.ischunk); + a->npages--; + a->nentries--; + Assert(a->nentries == 0); + a->status = TBM_EMPTY; + } + } + else + { + pagetable_iterator i; + 
PagetableEntry *apage; + + Assert(a->status == TBM_HASH); + pagetable_start_iterate(a->pagetable, &i); + while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL) + { + if (tbm_intersect_page(a, apage, b)) + { + /* Page or chunk is now empty, remove it from a */ + if (apage->ischunk) + a->nchunks--; + else + a->npages--; + a->nentries--; + if (!pagetable_delete(a->pagetable, apage->blockno)) + elog(ERROR, "hash table corrupted"); + } + } + } +} + +/* + * Process one page of a during an intersection op + * + * Returns true if apage is now empty and should be deleted from a + */ +static bool +tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) +{ + const PagetableEntry *bpage; + int wordnum; + + if (apage->ischunk) + { + /* Scan each bit in chunk, try to clear */ + bool candelete = true; + + for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) + { + bitmapword w = apage->words[wordnum]; + + if (w != 0) + { + bitmapword neww = w; + BlockNumber pg; + int bitnum; + + pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD); + bitnum = 0; + while (w != 0) + { + if (w & 1) + { + if (!tbm_page_is_lossy(b, pg) && + tbm_find_pageentry(b, pg) == NULL) + { + /* Page is not in b at all, lose lossy bit */ + neww &= ~((bitmapword) 1 << bitnum); + } + } + pg++; + bitnum++; + w >>= 1; + } + apage->words[wordnum] = neww; + if (neww != 0) + candelete = false; + } + } + return candelete; + } + else if (tbm_page_is_lossy(b, apage->blockno)) + { + /* + * Some of the tuples in 'a' might not satisfy the quals for 'b', but + * because the page 'b' is lossy, we don't know which ones. Therefore + * we mark 'a' as requiring rechecks, to indicate that at most those + * tuples set in 'a' are matches. + */ + apage->recheck = true; + return false; + } + else + { + bool candelete = true; + + bpage = tbm_find_pageentry(b, apage->blockno); + if (bpage != NULL) + { + /* Both pages are exact, merge at the bit level */ + Assert(!bpage->ischunk); + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + { + apage->words[wordnum] &= bpage->words[wordnum]; + if (apage->words[wordnum] != 0) + candelete = false; + } + apage->recheck |= bpage->recheck; + } + /* If there is no matching b page, we can just delete the a page */ + return candelete; + } +} + +/* + * tbm_is_empty - is a TIDBitmap completely empty? + */ +bool +tbm_is_empty(const TIDBitmap *tbm) +{ + return (tbm->nentries == 0); +} + +/* + * tbm_begin_iterate - prepare to iterate through a TIDBitmap + * + * The TBMIterator struct is created in the caller's memory context. + * For a clean shutdown of the iteration, call tbm_end_iterate; but it's + * okay to just allow the memory context to be released, too. It is caller's + * responsibility not to touch the TBMIterator anymore once the TIDBitmap + * is freed. + * + * NB: after this is called, it is no longer allowed to modify the contents + * of the bitmap. However, you can call this multiple times to scan the + * contents repeatedly, including parallel scans. + */ +TBMIterator * +tbm_begin_iterate(TIDBitmap *tbm) +{ + TBMIterator *iterator; + + Assert(tbm->iterating != TBM_ITERATING_SHARED); + + /* + * Create the TBMIterator struct, with enough trailing space to serve the + * needs of the TBMIterateResult sub-struct. + */ + iterator = (TBMIterator *) palloc(sizeof(TBMIterator) + + MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); + iterator->tbm = tbm; + + /* + * Initialize iteration pointers. 
+ */ + iterator->spageptr = 0; + iterator->schunkptr = 0; + iterator->schunkbit = 0; + + /* + * If we have a hashtable, create and fill the sorted page lists, unless + * we already did that for a previous iterator. Note that the lists are + * attached to the bitmap not the iterator, so they can be used by more + * than one iterator. + */ + if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING) + { + pagetable_iterator i; + PagetableEntry *page; + int npages; + int nchunks; + + if (!tbm->spages && tbm->npages > 0) + tbm->spages = (PagetableEntry **) + MemoryContextAlloc(tbm->mcxt, + tbm->npages * sizeof(PagetableEntry *)); + if (!tbm->schunks && tbm->nchunks > 0) + tbm->schunks = (PagetableEntry **) + MemoryContextAlloc(tbm->mcxt, + tbm->nchunks * sizeof(PagetableEntry *)); + + npages = nchunks = 0; + pagetable_start_iterate(tbm->pagetable, &i); + while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) + { + if (page->ischunk) + tbm->schunks[nchunks++] = page; + else + tbm->spages[npages++] = page; + } + Assert(npages == tbm->npages); + Assert(nchunks == tbm->nchunks); + if (npages > 1) + qsort(tbm->spages, npages, sizeof(PagetableEntry *), + tbm_comparator); + if (nchunks > 1) + qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), + tbm_comparator); + } + + tbm->iterating = TBM_ITERATING_PRIVATE; + + return iterator; +} + +/* + * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap. + * + * The necessary shared state will be allocated from the DSA passed to + * tbm_create, so that multiple processes can attach to it and iterate jointly. + * + * This will convert the pagetable hash into page and chunk array of the index + * into pagetable array. + */ +dsa_pointer +tbm_prepare_shared_iterate(TIDBitmap *tbm) +{ + dsa_pointer dp; + TBMSharedIteratorState *istate; + PTEntryArray *ptbase = NULL; + PTIterationArray *ptpages = NULL; + PTIterationArray *ptchunks = NULL; + + Assert(tbm->dsa != NULL); + Assert(tbm->iterating != TBM_ITERATING_PRIVATE); + + /* + * Allocate TBMSharedIteratorState from DSA to hold the shared members and + * lock, this will also be used by multiple worker for shared iterate. + */ + dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState)); + istate = dsa_get_address(tbm->dsa, dp); + + /* + * If we're not already iterating, create and fill the sorted page lists. + * (If we are, the sorted page lists are already stored in the TIDBitmap, + * and we can just reuse them.) + */ + if (tbm->iterating == TBM_NOT_ITERATING) + { + pagetable_iterator i; + PagetableEntry *page; + int idx; + int npages; + int nchunks; + + /* + * Allocate the page and chunk array memory from the DSA to share + * across multiple processes. + */ + if (tbm->npages) + { + tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) + + tbm->npages * sizeof(int)); + ptpages = dsa_get_address(tbm->dsa, tbm->ptpages); + pg_atomic_init_u32(&ptpages->refcount, 0); + } + if (tbm->nchunks) + { + tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) + + tbm->nchunks * sizeof(int)); + ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks); + pg_atomic_init_u32(&ptchunks->refcount, 0); + } + + /* + * If TBM status is TBM_HASH then iterate over the pagetable and + * convert it to page and chunk arrays. But if it's in the + * TBM_ONE_PAGE mode then directly allocate the space for one entry + * from the DSA. 
+ */ + npages = nchunks = 0; + if (tbm->status == TBM_HASH) + { + ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); + + pagetable_start_iterate(tbm->pagetable, &i); + while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) + { + idx = page - ptbase->ptentry; + if (page->ischunk) + ptchunks->index[nchunks++] = idx; + else + ptpages->index[npages++] = idx; + } + + Assert(npages == tbm->npages); + Assert(nchunks == tbm->nchunks); + } + else if (tbm->status == TBM_ONE_PAGE) + { + /* + * In one page mode allocate the space for one pagetable entry, + * initialize it, and directly store its index (i.e. 0) in the + * page array. + */ + tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) + + sizeof(PagetableEntry)); + ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); + memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry)); + ptpages->index[0] = 0; + } + + if (ptbase != NULL) + pg_atomic_init_u32(&ptbase->refcount, 0); + if (npages > 1) + qsort_arg((void *) (ptpages->index), npages, sizeof(int), + tbm_shared_comparator, (void *) ptbase->ptentry); + if (nchunks > 1) + qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int), + tbm_shared_comparator, (void *) ptbase->ptentry); + } + + /* + * Store the TBM members in the shared state so that we can share them + * across multiple processes. + */ + istate->nentries = tbm->nentries; + istate->maxentries = tbm->maxentries; + istate->npages = tbm->npages; + istate->nchunks = tbm->nchunks; + istate->pagetable = tbm->dsapagetable; + istate->spages = tbm->ptpages; + istate->schunks = tbm->ptchunks; + + ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); + ptpages = dsa_get_address(tbm->dsa, tbm->ptpages); + ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks); + + /* + * For every shared iterator, referring to pagetable and iterator array, + * increase the refcount by 1 so that while freeing the shared iterator we + * don't free pagetable and iterator array until its refcount becomes 0. + */ + if (ptbase != NULL) + pg_atomic_add_fetch_u32(&ptbase->refcount, 1); + if (ptpages != NULL) + pg_atomic_add_fetch_u32(&ptpages->refcount, 1); + if (ptchunks != NULL) + pg_atomic_add_fetch_u32(&ptchunks->refcount, 1); + + /* Initialize the iterator lock */ + LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP); + + /* Initialize the shared iterator state */ + istate->schunkbit = 0; + istate->schunkptr = 0; + istate->spageptr = 0; + + tbm->iterating = TBM_ITERATING_SHARED; + + return dp; +} + +/* + * tbm_extract_page_tuple - extract the tuple offsets from a page + * + * The extracted offsets are stored into TBMIterateResult. 
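+ *
+ * For example, a page entry whose words[0] is 0x13 (bits 0, 1 and 4 set)
+ * yields offsets 1, 2 and 5, in that order; the function's return value is
+ * the number of offsets emitted, here 3.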
+ */ +static inline int +tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output) +{ + int wordnum; + int ntuples = 0; + + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + { + bitmapword w = page->words[wordnum]; + + if (w != 0) + { + int off = wordnum * BITS_PER_BITMAPWORD + 1; + + while (w != 0) + { + if (w & 1) + output->offsets[ntuples++] = (OffsetNumber) off; + off++; + w >>= 1; + } + } + } + + return ntuples; +} + +/* + * tbm_advance_schunkbit - Advance the schunkbit + */ +static inline void +tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp) +{ + int schunkbit = *schunkbitp; + + while (schunkbit < PAGES_PER_CHUNK) + { + int wordnum = WORDNUM(schunkbit); + int bitnum = BITNUM(schunkbit); + + if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0) + break; + schunkbit++; + } + + *schunkbitp = schunkbit; +} + +/* + * tbm_iterate - scan through next page of a TIDBitmap + * + * Returns a TBMIterateResult representing one page, or NULL if there are + * no more pages to scan. Pages are guaranteed to be delivered in numerical + * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to + * remember the exact tuples to look at on this page --- the caller must + * examine all tuples on the page and check if they meet the intended + * condition. If result->recheck is true, only the indicated tuples need + * be examined, but the condition must be rechecked anyway. (For ease of + * testing, recheck is always set true when ntuples < 0.) + */ +TBMIterateResult * +tbm_iterate(TBMIterator *iterator) +{ + TIDBitmap *tbm = iterator->tbm; + TBMIterateResult *output = &(iterator->output); + + Assert(tbm->iterating == TBM_ITERATING_PRIVATE); + + /* + * If lossy chunk pages remain, make sure we've advanced schunkptr/ + * schunkbit to the next set bit. + */ + while (iterator->schunkptr < tbm->nchunks) + { + PagetableEntry *chunk = tbm->schunks[iterator->schunkptr]; + int schunkbit = iterator->schunkbit; + + tbm_advance_schunkbit(chunk, &schunkbit); + if (schunkbit < PAGES_PER_CHUNK) + { + iterator->schunkbit = schunkbit; + break; + } + /* advance to next chunk */ + iterator->schunkptr++; + iterator->schunkbit = 0; + } + + /* + * If both chunk and per-page data remain, must output the numerically + * earlier page. + */ + if (iterator->schunkptr < tbm->nchunks) + { + PagetableEntry *chunk = tbm->schunks[iterator->schunkptr]; + BlockNumber chunk_blockno; + + chunk_blockno = chunk->blockno + iterator->schunkbit; + if (iterator->spageptr >= tbm->npages || + chunk_blockno < tbm->spages[iterator->spageptr]->blockno) + { + /* Return a lossy page indicator from the chunk */ + output->blockno = chunk_blockno; + output->ntuples = -1; + output->recheck = true; + iterator->schunkbit++; + return output; + } + } + + if (iterator->spageptr < tbm->npages) + { + PagetableEntry *page; + int ntuples; + + /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */ + if (tbm->status == TBM_ONE_PAGE) + page = &tbm->entry1; + else + page = tbm->spages[iterator->spageptr]; + + /* scan bitmap to extract individual offset numbers */ + ntuples = tbm_extract_page_tuple(page, output); + output->blockno = page->blockno; + output->ntuples = ntuples; + output->recheck = page->recheck; + iterator->spageptr++; + return output; + } + + /* Nothing more in the bitmap */ + return NULL; +} + +/* + * tbm_shared_iterate - scan through next page of a TIDBitmap + * + * As above, but this will iterate using an iterator which is shared + * across multiple processes. 
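+ * The result contract is the same as for tbm_iterate, so a caller's loop
+ * looks roughly like this (a sketch; the actual consumers live in the
+ * bitmap heap scan code):
+ *
+ *		TBMIterateResult *res;
+ *
+ *		while ((res = tbm_shared_iterate(it)) != NULL)
+ *		{
+ *			if (res->ntuples >= 0)
+ *				visit offsets res->offsets[0..res->ntuples-1] of page
+ *				res->blockno, rechecking quals if res->recheck;
+ *			else
+ *				visit every tuple on page res->blockno (lossy result);
+ *		}
+ *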
We need to acquire the iterator LWLock, + * before accessing the shared members. + */ +TBMIterateResult * +tbm_shared_iterate(TBMSharedIterator *iterator) +{ + TBMIterateResult *output = &iterator->output; + TBMSharedIteratorState *istate = iterator->state; + PagetableEntry *ptbase = NULL; + int *idxpages = NULL; + int *idxchunks = NULL; + + if (iterator->ptbase != NULL) + ptbase = iterator->ptbase->ptentry; + if (iterator->ptpages != NULL) + idxpages = iterator->ptpages->index; + if (iterator->ptchunks != NULL) + idxchunks = iterator->ptchunks->index; + + /* Acquire the LWLock before accessing the shared members */ + LWLockAcquire(&istate->lock, LW_EXCLUSIVE); + + /* + * If lossy chunk pages remain, make sure we've advanced schunkptr/ + * schunkbit to the next set bit. + */ + while (istate->schunkptr < istate->nchunks) + { + PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]]; + int schunkbit = istate->schunkbit; + + tbm_advance_schunkbit(chunk, &schunkbit); + if (schunkbit < PAGES_PER_CHUNK) + { + istate->schunkbit = schunkbit; + break; + } + /* advance to next chunk */ + istate->schunkptr++; + istate->schunkbit = 0; + } + + /* + * If both chunk and per-page data remain, must output the numerically + * earlier page. + */ + if (istate->schunkptr < istate->nchunks) + { + PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]]; + BlockNumber chunk_blockno; + + chunk_blockno = chunk->blockno + istate->schunkbit; + + if (istate->spageptr >= istate->npages || + chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno) + { + /* Return a lossy page indicator from the chunk */ + output->blockno = chunk_blockno; + output->ntuples = -1; + output->recheck = true; + istate->schunkbit++; + + LWLockRelease(&istate->lock); + return output; + } + } + + if (istate->spageptr < istate->npages) + { + PagetableEntry *page = &ptbase[idxpages[istate->spageptr]]; + int ntuples; + + /* scan bitmap to extract individual offset numbers */ + ntuples = tbm_extract_page_tuple(page, output); + output->blockno = page->blockno; + output->ntuples = ntuples; + output->recheck = page->recheck; + istate->spageptr++; + + LWLockRelease(&istate->lock); + + return output; + } + + LWLockRelease(&istate->lock); + + /* Nothing more in the bitmap */ + return NULL; +} + +/* + * tbm_end_iterate - finish an iteration over a TIDBitmap + * + * Currently this is just a pfree, but it might do more someday. (For + * instance, it could be useful to count open iterators and allow the + * bitmap to return to read/write status when there are no more iterators.) + */ +void +tbm_end_iterate(TBMIterator *iterator) +{ + pfree(iterator); +} + +/* + * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap + * + * This doesn't free any of the shared state associated with the iterator, + * just our backend-private state. + */ +void +tbm_end_shared_iterate(TBMSharedIterator *iterator) +{ + pfree(iterator); +} + +/* + * tbm_find_pageentry - find a PagetableEntry for the pageno + * + * Returns NULL if there is no non-lossy entry for the pageno. 
+ */ +static const PagetableEntry * +tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno) +{ + const PagetableEntry *page; + + if (tbm->nentries == 0) /* in case pagetable doesn't exist */ + return NULL; + + if (tbm->status == TBM_ONE_PAGE) + { + page = &tbm->entry1; + if (page->blockno != pageno) + return NULL; + Assert(!page->ischunk); + return page; + } + + page = pagetable_lookup(tbm->pagetable, pageno); + if (page == NULL) + return NULL; + if (page->ischunk) + return NULL; /* don't want a lossy chunk header */ + return page; +} + +/* + * tbm_get_pageentry - find or create a PagetableEntry for the pageno + * + * If new, the entry is marked as an exact (non-chunk) entry. + * + * This may cause the table to exceed the desired memory size. It is + * up to the caller to call tbm_lossify() at the next safe point if so. + */ +static PagetableEntry * +tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + bool found; + + if (tbm->status == TBM_EMPTY) + { + /* Use the fixed slot */ + page = &tbm->entry1; + found = false; + tbm->status = TBM_ONE_PAGE; + } + else + { + if (tbm->status == TBM_ONE_PAGE) + { + page = &tbm->entry1; + if (page->blockno == pageno) + return page; + /* Time to switch from one page to a hashtable */ + tbm_create_pagetable(tbm); + } + + /* Look up or create an entry */ + page = pagetable_insert(tbm->pagetable, pageno, &found); + } + + /* Initialize it if not present before */ + if (!found) + { + char oldstatus = page->status; + + MemSet(page, 0, sizeof(PagetableEntry)); + page->status = oldstatus; + page->blockno = pageno; + /* must count it too */ + tbm->nentries++; + tbm->npages++; + } + + return page; +} + +/* + * tbm_page_is_lossy - is the page marked as lossily stored? + */ +static bool +tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + BlockNumber chunk_pageno; + int bitno; + + /* we can skip the lookup if there are no lossy chunks */ + if (tbm->nchunks == 0) + return false; + Assert(tbm->status == TBM_HASH); + + bitno = pageno % PAGES_PER_CHUNK; + chunk_pageno = pageno - bitno; + + page = pagetable_lookup(tbm->pagetable, chunk_pageno); + + if (page != NULL && page->ischunk) + { + int wordnum = WORDNUM(bitno); + int bitnum = BITNUM(bitno); + + if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0) + return true; + } + return false; +} + +/* + * tbm_mark_page_lossy - mark the page number as lossily stored + * + * This may cause the table to exceed the desired memory size. It is + * up to the caller to call tbm_lossify() at the next safe point if so. + */ +static void +tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + bool found; + BlockNumber chunk_pageno; + int bitno; + int wordnum; + int bitnum; + + /* We force the bitmap into hashtable mode whenever it's lossy */ + if (tbm->status != TBM_HASH) + tbm_create_pagetable(tbm); + + bitno = pageno % PAGES_PER_CHUNK; + chunk_pageno = pageno - bitno; + + /* + * Remove any extant non-lossy entry for the page. If the page is its own + * chunk header, however, we skip this and handle the case below. 
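+	 *
+	 * For example, with PAGES_PER_CHUNK = 256 (the 8K-BLCKSZ value):
+	 * lossifying page 1000 deletes any exact entry for block 1000 and sets
+	 * bit 232 in the chunk headed by block 768, whereas lossifying page 768
+	 * itself converts the block-768 entry in place, in the !ischunk branch
+	 * below.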
+ */ + if (bitno != 0) + { + if (pagetable_delete(tbm->pagetable, pageno)) + { + /* It was present, so adjust counts */ + tbm->nentries--; + tbm->npages--; /* assume it must have been non-lossy */ + } + } + + /* Look up or create entry for chunk-header page */ + page = pagetable_insert(tbm->pagetable, chunk_pageno, &found); + + /* Initialize it if not present before */ + if (!found) + { + char oldstatus = page->status; + + MemSet(page, 0, sizeof(PagetableEntry)); + page->status = oldstatus; + page->blockno = chunk_pageno; + page->ischunk = true; + /* must count it too */ + tbm->nentries++; + tbm->nchunks++; + } + else if (!page->ischunk) + { + char oldstatus = page->status; + + /* chunk header page was formerly non-lossy, make it lossy */ + MemSet(page, 0, sizeof(PagetableEntry)); + page->status = oldstatus; + page->blockno = chunk_pageno; + page->ischunk = true; + /* we assume it had some tuple bit(s) set, so mark it lossy */ + page->words[0] = ((bitmapword) 1 << 0); + /* adjust counts */ + tbm->nchunks++; + tbm->npages--; + } + + /* Now set the original target page's bit */ + wordnum = WORDNUM(bitno); + bitnum = BITNUM(bitno); + page->words[wordnum] |= ((bitmapword) 1 << bitnum); +} + +/* + * tbm_lossify - lose some information to get back under the memory limit + */ +static void +tbm_lossify(TIDBitmap *tbm) +{ + pagetable_iterator i; + PagetableEntry *page; + + /* + * XXX Really stupid implementation: this just lossifies pages in + * essentially random order. We should be paying some attention to the + * number of bits set in each page, instead. + * + * Since we are called as soon as nentries exceeds maxentries, we should + * push nentries down to significantly less than maxentries, or else we'll + * just end up doing this again very soon. We shoot for maxentries/2. + */ + Assert(tbm->iterating == TBM_NOT_ITERATING); + Assert(tbm->status == TBM_HASH); + + pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start); + while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) + { + if (page->ischunk) + continue; /* already a chunk header */ + + /* + * If the page would become a chunk header, we won't save anything by + * converting it to lossy, so skip it. + */ + if ((page->blockno % PAGES_PER_CHUNK) == 0) + continue; + + /* This does the dirty work ... */ + tbm_mark_page_lossy(tbm, page->blockno); + + if (tbm->nentries <= tbm->maxentries / 2) + { + /* + * We have made enough room. Remember where to start lossifying + * next round, so we evenly iterate over the hashtable. + */ + tbm->lossify_start = i.cur; + break; + } + + /* + * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the + * hashtable and may have deleted the non-lossy chunk. We can + * continue the same hash table scan, since failure to visit one + * element or visiting the newly inserted element, isn't fatal. + */ + } + + /* + * With a big bitmap and small work_mem, it's possible that we cannot get + * under maxentries. Again, if that happens, we'd end up uselessly + * calling tbm_lossify over and over. To prevent this from becoming a + * performance sink, force maxentries up to at least double the current + * number of entries. (In essence, we're admitting inability to fit + * within work_mem when we do this.) Note that this test will not fire if + * we broke out of the loop early; and if we didn't, the current number of + * entries is simply not reducible any further. 
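+	 *
+	 * For example, if maxentries is 1000 but the loop above bottoms out at
+	 * 1600 entries, the test below raises maxentries to 3200; callers won't
+	 * invoke tbm_lossify again until nentries exceeds that.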
+	 */
+	if (tbm->nentries > tbm->maxentries / 2)
+		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
+}
+
+/*
+ * qsort comparator to handle PagetableEntry pointers.
+ */
+static int
+tbm_comparator(const void *left, const void *right)
+{
+	BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
+	BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
+
+	if (l < r)
+		return -1;
+	else if (l > r)
+		return 1;
+	return 0;
+}
+
+/*
+ * As above, but this receives indexes into the PagetableEntry array, so it
+ * must look up the actual PagetableEntry via the index before comparing
+ * blocknos.
+ */
+static int
+tbm_shared_comparator(const void *left, const void *right, void *arg)
+{
+	PagetableEntry *base = (PagetableEntry *) arg;
+	PagetableEntry *lpage = &base[*(int *) left];
+	PagetableEntry *rpage = &base[*(int *) right];
+
+	if (lpage->blockno < rpage->blockno)
+		return -1;
+	else if (lpage->blockno > rpage->blockno)
+		return 1;
+	return 0;
+}
+
+/*
+ * tbm_attach_shared_iterate
+ *
+ * Allocate a backend-private iterator and attach the shared iterator state
+ * to it so that multiple processes can iterate jointly.
+ *
+ * We also convert the DSA pointers to local pointers and store them in
+ * our private iterator.
+ */
+TBMSharedIterator *
+tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
+{
+	TBMSharedIterator *iterator;
+	TBMSharedIteratorState *istate;
+
+	/*
+	 * Create the TBMSharedIterator struct, with enough trailing space to
+	 * serve the needs of the TBMIterateResult sub-struct.
+	 */
+	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
+											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+
+	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
+
+	iterator->state = istate;
+
+	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
+
+	if (istate->npages)
+		iterator->ptpages = dsa_get_address(dsa, istate->spages);
+	if (istate->nchunks)
+		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
+
+	return iterator;
+}
+
+/*
+ * pagetable_allocate
+ *
+ * Callback function for allocating the memory for hashtable elements.
+ * Allocate memory for hashtable elements, using DSA if available.
+ */
+static inline void *
+pagetable_allocate(pagetable_hash *pagetable, Size size)
+{
+	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
+	PTEntryArray *ptbase;
+
+	if (tbm->dsa == NULL)
+		return MemoryContextAllocExtended(pagetable->ctx, size,
+										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
+
+	/*
+	 * Save the dsapagetable reference in dsapagetableold before allocating
+	 * new memory so that pagetable_free can free the old entry.
+	 */
+	tbm->dsapagetableold = tbm->dsapagetable;
+	tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
+											  sizeof(PTEntryArray) + size,
+											  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
+	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
+
+	return ptbase->ptentry;
+}
+
+/*
+ * pagetable_free
+ *
+ * Callback function for freeing hash table elements.
+ */
+static inline void
+pagetable_free(pagetable_hash *pagetable, void *pointer)
+{
+	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
+
+	/* pfree the input pointer if DSA is not available */
+	if (tbm->dsa == NULL)
+		pfree(pointer);
+	else if (DsaPointerIsValid(tbm->dsapagetableold))
+	{
+		dsa_free(tbm->dsa, tbm->dsapagetableold);
+		tbm->dsapagetableold = InvalidDsaPointer;
+	}
+}
+
+/*
+ * tbm_calculate_entries
+ *
+ * Estimate number of hashtable entries we can have within maxbytes.
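+ *
+ * As a worked example (assuming a 64-bit build where sizeof(PagetableEntry)
+ * is 48 bytes and pointers are 8 bytes; the exact sizes are
+ * platform-dependent): maxbytes = 4MB gives 4194304 / (48 + 8 + 8) = 65536
+ * entries.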
+ */
+long
+tbm_calculate_entries(double maxbytes)
+{
+	long		nbuckets;
+
+	/*
+	 * Estimate number of hashtable entries we can have within maxbytes. This
+	 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
+	 * for our purpose.  Also count an extra Pointer per entry for the arrays
+	 * created during iteration readout.
+	 */
+	nbuckets = maxbytes /
+		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
+	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
+	nbuckets = Max(nbuckets, 16);	/* sanity limit */
+
+	return nbuckets;
+}