diff options
Diffstat (limited to 'src/backend/access/gin/ginpostinglist.c')
-rw-r--r-- | src/backend/access/gin/ginpostinglist.c | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c new file mode 100644 index 0000000..66a8983 --- /dev/null +++ b/src/backend/access/gin/ginpostinglist.c @@ -0,0 +1,434 @@ +/*------------------------------------------------------------------------- + * + * ginpostinglist.c + * routines for dealing with posting lists. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/gin/ginpostinglist.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gin_private.h" + +#ifdef USE_ASSERT_CHECKING +#define CHECK_ENCODING_ROUNDTRIP +#endif + +/* + * For encoding purposes, item pointers are represented as 64-bit unsigned + * integers. The lowest 11 bits represent the offset number, and the next + * lowest 32 bits are the block number. That leaves 21 bits unused, i.e. + * only 43 low bits are used. + * + * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage < + * 2^11 on all supported block sizes. We are frugal with the bits, because + * smaller integers use fewer bytes in the varbyte encoding, saving disk + * space. (If we get a new table AM in the future that wants to use the full + * range of possible offset numbers, we'll need to change this.) + * + * These 43-bit integers are encoded using varbyte encoding. In each byte, + * the 7 low bits contain data, while the highest bit is a continuation bit. + * When the continuation bit is set, the next byte is part of the same + * integer, otherwise this is the last byte of this integer. 43 bits need + * at most 7 bytes in this encoding: + * + * 0XXXXXXX + * 1XXXXXXX 0XXXXYYY + * 1XXXXXXX 1XXXXYYY 0YYYYYYY + * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY + * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY + * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY + * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY + * + * X = bits used for offset number + * Y = bits used for block number + * u = unused bit + * + * The bytes are in stored in little-endian order. + * + * An important property of this encoding is that removing an item from list + * never increases the size of the resulting compressed posting list. Proof: + * + * Removing number is actually replacement of two numbers with their sum. We + * have to prove that varbyte encoding of a sum can't be longer than varbyte + * encoding of its summands. Sum of two numbers is at most one bit wider than + * the larger of the summands. Widening a number by one bit enlarges its length + * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum + * is at most one byte longer than varbyte encoding of larger summand. Lesser + * summand is at least one byte, so the sum cannot take more space than the + * summands, Q.E.D. + * + * This property greatly simplifies VACUUM, which can assume that posting + * lists always fit on the same page after vacuuming. Note that even though + * that holds for removing items from a posting list, you must also be + * careful to not cause expansion e.g. when merging uncompressed items on the + * page into the compressed lists, when vacuuming. + */ + +/* + * How many bits do you need to encode offset number? OffsetNumber is a 16-bit + * integer, but you can't fit that many items on a page. 11 ought to be more + * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and + * use the minimum number of bits, but that would require changing the on-disk + * format if MaxHeapTuplesPerPage changes. Better to leave some slack. + */ +#define MaxHeapTuplesPerPageBits 11 + +/* Max. number of bytes needed to encode the largest supported integer. */ +#define MaxBytesPerInteger 7 + +static inline uint64 +itemptr_to_uint64(const ItemPointer iptr) +{ + uint64 val; + + Assert(ItemPointerIsValid(iptr)); + Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits)); + + val = GinItemPointerGetBlockNumber(iptr); + val <<= MaxHeapTuplesPerPageBits; + val |= GinItemPointerGetOffsetNumber(iptr); + + return val; +} + +static inline void +uint64_to_itemptr(uint64 val, ItemPointer iptr) +{ + GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1)); + val = val >> MaxHeapTuplesPerPageBits; + GinItemPointerSetBlockNumber(iptr, val); + + Assert(ItemPointerIsValid(iptr)); +} + +/* + * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer. + */ +static void +encode_varbyte(uint64 val, unsigned char **ptr) +{ + unsigned char *p = *ptr; + + while (val > 0x7F) + { + *(p++) = 0x80 | (val & 0x7F); + val >>= 7; + } + *(p++) = (unsigned char) val; + + *ptr = p; +} + +/* + * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer. + */ +static uint64 +decode_varbyte(unsigned char **ptr) +{ + uint64 val; + unsigned char *p = *ptr; + uint64 c; + + /* 1st byte */ + c = *(p++); + val = c & 0x7F; + if (c & 0x80) + { + /* 2nd byte */ + c = *(p++); + val |= (c & 0x7F) << 7; + if (c & 0x80) + { + /* 3rd byte */ + c = *(p++); + val |= (c & 0x7F) << 14; + if (c & 0x80) + { + /* 4th byte */ + c = *(p++); + val |= (c & 0x7F) << 21; + if (c & 0x80) + { + /* 5th byte */ + c = *(p++); + val |= (c & 0x7F) << 28; + if (c & 0x80) + { + /* 6th byte */ + c = *(p++); + val |= (c & 0x7F) << 35; + if (c & 0x80) + { + /* 7th byte, should not have continuation bit */ + c = *(p++); + val |= c << 42; + Assert((c & 0x80) == 0); + } + } + } + } + } + } + + *ptr = p; + + return val; +} + +/* + * Encode a posting list. + * + * The encoded list is returned in a palloc'd struct, which will be at most + * 'maxsize' bytes in size. The number items in the returned segment is + * returned in *nwritten. If it's not equal to nipd, not all the items fit + * in 'maxsize', and only the first *nwritten were encoded. + * + * The allocated size of the returned struct is short-aligned, and the padding + * byte at the end, if any, is zero. + */ +GinPostingList * +ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, + int *nwritten) +{ + uint64 prev; + int totalpacked = 0; + int maxbytes; + GinPostingList *result; + unsigned char *ptr; + unsigned char *endptr; + + maxsize = SHORTALIGN_DOWN(maxsize); + + result = palloc(maxsize); + + maxbytes = maxsize - offsetof(GinPostingList, bytes); + Assert(maxbytes > 0); + + /* Store the first special item */ + result->first = ipd[0]; + + prev = itemptr_to_uint64(&result->first); + + ptr = result->bytes; + endptr = result->bytes + maxbytes; + for (totalpacked = 1; totalpacked < nipd; totalpacked++) + { + uint64 val = itemptr_to_uint64(&ipd[totalpacked]); + uint64 delta = val - prev; + + Assert(val > prev); + + if (endptr - ptr >= MaxBytesPerInteger) + encode_varbyte(delta, &ptr); + else + { + /* + * There are less than 7 bytes left. Have to check if the next + * item fits in that space before writing it out. + */ + unsigned char buf[MaxBytesPerInteger]; + unsigned char *p = buf; + + encode_varbyte(delta, &p); + if (p - buf > (endptr - ptr)) + break; /* output is full */ + + memcpy(ptr, buf, p - buf); + ptr += (p - buf); + } + prev = val; + } + result->nbytes = ptr - result->bytes; + + /* + * If we wrote an odd number of bytes, zero out the padding byte at the + * end. + */ + if (result->nbytes != SHORTALIGN(result->nbytes)) + result->bytes[result->nbytes] = 0; + + if (nwritten) + *nwritten = totalpacked; + + Assert(SizeOfGinPostingList(result) <= maxsize); + + /* + * Check that the encoded segment decodes back to the original items. + */ +#if defined (CHECK_ENCODING_ROUNDTRIP) + { + int ndecoded; + ItemPointer tmp = ginPostingListDecode(result, &ndecoded); + + Assert(ndecoded == totalpacked); + Assert(memcmp(tmp, ipd, ndecoded * sizeof(ItemPointerData)) == 0); + pfree(tmp); + } +#endif + + return result; +} + +/* + * Decode a compressed posting list into an array of item pointers. + * The number of items is returned in *ndecoded. + */ +ItemPointer +ginPostingListDecode(GinPostingList *plist, int *ndecoded_out) +{ + return ginPostingListDecodeAllSegments(plist, + SizeOfGinPostingList(plist), + ndecoded_out); +} + +/* + * Decode multiple posting list segments into an array of item pointers. + * The number of items is returned in *ndecoded_out. The segments are stored + * one after each other, with total size 'len' bytes. + */ +ItemPointer +ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out) +{ + ItemPointer result; + int nallocated; + uint64 val; + char *endseg = ((char *) segment) + len; + int ndecoded; + unsigned char *ptr; + unsigned char *endptr; + + /* + * Guess an initial size of the array. + */ + nallocated = segment->nbytes * 2 + 1; + result = palloc(nallocated * sizeof(ItemPointerData)); + + ndecoded = 0; + while ((char *) segment < endseg) + { + /* enlarge output array if needed */ + if (ndecoded >= nallocated) + { + nallocated *= 2; + result = repalloc(result, nallocated * sizeof(ItemPointerData)); + } + + /* copy the first item */ + Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first))); + Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0); + result[ndecoded] = segment->first; + ndecoded++; + + val = itemptr_to_uint64(&segment->first); + ptr = segment->bytes; + endptr = segment->bytes + segment->nbytes; + while (ptr < endptr) + { + /* enlarge output array if needed */ + if (ndecoded >= nallocated) + { + nallocated *= 2; + result = repalloc(result, nallocated * sizeof(ItemPointerData)); + } + + val += decode_varbyte(&ptr); + + uint64_to_itemptr(val, &result[ndecoded]); + ndecoded++; + } + segment = GinNextPostingListSegment(segment); + } + + if (ndecoded_out) + *ndecoded_out = ndecoded; + return result; +} + +/* + * Add all item pointers from a bunch of posting lists to a TIDBitmap. + */ +int +ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, + TIDBitmap *tbm) +{ + int ndecoded; + ItemPointer items; + + items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded); + tbm_add_tuples(tbm, items, ndecoded, false); + pfree(items); + + return ndecoded; +} + +/* + * Merge two ordered arrays of itempointers, eliminating any duplicates. + * + * Returns a palloc'd array, and *nmerged is set to the number of items in + * the result, after eliminating duplicates. + */ +ItemPointer +ginMergeItemPointers(ItemPointerData *a, uint32 na, + ItemPointerData *b, uint32 nb, + int *nmerged) +{ + ItemPointerData *dst; + + dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData)); + + /* + * If the argument arrays don't overlap, we can just append them to each + * other. + */ + if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0) + { + memcpy(dst, a, na * sizeof(ItemPointerData)); + memcpy(&dst[na], b, nb * sizeof(ItemPointerData)); + *nmerged = na + nb; + } + else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0) + { + memcpy(dst, b, nb * sizeof(ItemPointerData)); + memcpy(&dst[nb], a, na * sizeof(ItemPointerData)); + *nmerged = na + nb; + } + else + { + ItemPointerData *dptr = dst; + ItemPointerData *aptr = a; + ItemPointerData *bptr = b; + + while (aptr - a < na && bptr - b < nb) + { + int cmp = ginCompareItemPointers(aptr, bptr); + + if (cmp > 0) + *dptr++ = *bptr++; + else if (cmp == 0) + { + /* only keep one copy of the identical items */ + *dptr++ = *bptr++; + aptr++; + } + else + *dptr++ = *aptr++; + } + + while (aptr - a < na) + *dptr++ = *aptr++; + + while (bptr - b < nb) + *dptr++ = *bptr++; + + *nmerged = dptr - dst; + } + + return dst; +} |