diff options
Diffstat (limited to 'src/backend/access/brin/brin_tuple.c')
-rw-r--r-- | src/backend/access/brin/brin_tuple.c | 708 |
1 files changed, 708 insertions, 0 deletions
diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c new file mode 100644 index 0000000..09e563b --- /dev/null +++ b/src/backend/access/brin/brin_tuple.c @@ -0,0 +1,708 @@ +/* + * brin_tuple.c + * Method implementations for tuples in BRIN indexes. + * + * Intended usage is that code outside this file only deals with + * BrinMemTuples, and convert to and from the on-disk representation through + * functions in this file. + * + * NOTES + * + * A BRIN tuple is similar to a heap tuple, with a few key differences. The + * first interesting difference is that the tuple header is much simpler, only + * containing its total length and a small area for flags. Also, the stored + * data does not match the relation tuple descriptor exactly: for each + * attribute in the descriptor, the index tuple carries an arbitrary number + * of values, depending on the opclass. + * + * Also, for each column of the index relation there are two null bits: one + * (hasnulls) stores whether any tuple within the page range has that column + * set to null; the other one (allnulls) stores whether the column values are + * all null. If allnulls is true, then the tuple data area does not contain + * values for that column at all; whereas it does if the hasnulls is set. + * Note the size of the null bitmask may not be the same as that of the + * datum array. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_tuple.c + */ +#include "postgres.h" + +#include "access/brin_tuple.h" +#include "access/detoast.h" +#include "access/heaptoast.h" +#include "access/htup_details.h" +#include "access/toast_internals.h" +#include "access/tupdesc.h" +#include "access/tupmacs.h" +#include "utils/datum.h" +#include "utils/memutils.h" + + +/* + * This enables de-toasting of index entries. Needed until VACUUM is + * smart enough to rebuild indexes from scratch. + */ +#define TOAST_INDEX_HACK + + +static inline void brin_deconstruct_tuple(BrinDesc *brdesc, + char *tp, bits8 *nullbits, bool nulls, + Datum *values, bool *allnulls, bool *hasnulls); + + +/* + * Return a tuple descriptor used for on-disk storage of BRIN tuples. + */ +static TupleDesc +brtuple_disk_tupdesc(BrinDesc *brdesc) +{ + /* We cache these in the BrinDesc */ + if (brdesc->bd_disktdesc == NULL) + { + int i; + int j; + AttrNumber attno = 1; + TupleDesc tupdesc; + MemoryContext oldcxt; + + /* make sure it's in the bdesc's context */ + oldcxt = MemoryContextSwitchTo(brdesc->bd_context); + + tupdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored); + + for (i = 0; i < brdesc->bd_tupdesc->natts; i++) + { + for (j = 0; j < brdesc->bd_info[i]->oi_nstored; j++) + TupleDescInitEntry(tupdesc, attno++, NULL, + brdesc->bd_info[i]->oi_typcache[j]->type_id, + -1, 0); + } + + MemoryContextSwitchTo(oldcxt); + + brdesc->bd_disktdesc = tupdesc; + } + + return brdesc->bd_disktdesc; +} + +/* + * Generate a new on-disk tuple to be inserted in a BRIN index. + * + * See brin_form_placeholder_tuple if you touch this. + */ +BrinTuple * +brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, + Size *size) +{ + Datum *values; + bool *nulls; + bool anynulls = false; + BrinTuple *rettuple; + int keyno; + int idxattno; + uint16 phony_infomask = 0; + bits8 *phony_nullbitmap; + Size len, + hoff, + data_len; + int i; + +#ifdef TOAST_INDEX_HACK + Datum *untoasted_values; + int nuntoasted = 0; +#endif + + Assert(brdesc->bd_totalstored > 0); + + values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored); + nulls = (bool *) palloc0(sizeof(bool) * brdesc->bd_totalstored); + phony_nullbitmap = (bits8 *) + palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored)); + +#ifdef TOAST_INDEX_HACK + untoasted_values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored); +#endif + + /* + * Set up the values/nulls arrays for heap_fill_tuple + */ + idxattno = 0; + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + int datumno; + + /* + * "allnulls" is set when there's no nonnull value in any row in the + * column; when this happens, there is no data to store. Thus set the + * nullable bits for all data elements of this column and we're done. + */ + if (tuple->bt_columns[keyno].bv_allnulls) + { + for (datumno = 0; + datumno < brdesc->bd_info[keyno]->oi_nstored; + datumno++) + nulls[idxattno++] = true; + anynulls = true; + continue; + } + + /* + * The "hasnulls" bit is set when there are some null values in the + * data. We still need to store a real value, but the presence of + * this means we need a null bitmap. + */ + if (tuple->bt_columns[keyno].bv_hasnulls) + anynulls = true; + + /* If needed, serialize the values before forming the on-disk tuple. */ + if (tuple->bt_columns[keyno].bv_serialize) + { + tuple->bt_columns[keyno].bv_serialize(brdesc, + tuple->bt_columns[keyno].bv_mem_value, + tuple->bt_columns[keyno].bv_values); + } + + /* + * Now obtain the values of each stored datum. Note that some values + * might be toasted, and we cannot rely on the original heap values + * sticking around forever, so we must detoast them. Also try to + * compress them. + */ + for (datumno = 0; + datumno < brdesc->bd_info[keyno]->oi_nstored; + datumno++) + { + Datum value = tuple->bt_columns[keyno].bv_values[datumno]; + +#ifdef TOAST_INDEX_HACK + + /* We must look at the stored type, not at the index descriptor. */ + TypeCacheEntry *atttype = brdesc->bd_info[keyno]->oi_typcache[datumno]; + + /* Do we need to free the value at the end? */ + bool free_value = false; + + /* For non-varlena types we don't need to do anything special */ + if (atttype->typlen != -1) + { + values[idxattno++] = value; + continue; + } + + /* + * Do nothing if value is not of varlena type. We don't need to + * care about NULL values here, thanks to bv_allnulls above. + * + * If value is stored EXTERNAL, must fetch it so we are not + * depending on outside storage. + * + * XXX Is this actually true? Could it be that the summary is NULL + * even for range with non-NULL data? E.g. degenerate bloom filter + * may be thrown away, etc. + */ + if (VARATT_IS_EXTERNAL(DatumGetPointer(value))) + { + value = PointerGetDatum(detoast_external_attr((struct varlena *) + DatumGetPointer(value))); + free_value = true; + } + + /* + * If value is above size target, and is of a compressible + * datatype, try to compress it in-line. + */ + if (!VARATT_IS_EXTENDED(DatumGetPointer(value)) && + VARSIZE(DatumGetPointer(value)) > TOAST_INDEX_TARGET && + (atttype->typstorage == TYPSTORAGE_EXTENDED || + atttype->typstorage == TYPSTORAGE_MAIN)) + { + Datum cvalue; + char compression; + Form_pg_attribute att = TupleDescAttr(brdesc->bd_tupdesc, + keyno); + + /* + * If the BRIN summary and indexed attribute use the same data + * type and it has a valid compression method, we can use the + * same compression method. Otherwise we have to use the + * default method. + */ + if (att->atttypid == atttype->type_id) + compression = att->attcompression; + else + compression = InvalidCompressionMethod; + + cvalue = toast_compress_datum(value, compression); + + if (DatumGetPointer(cvalue) != NULL) + { + /* successful compression */ + if (free_value) + pfree(DatumGetPointer(value)); + + value = cvalue; + free_value = true; + } + } + + /* + * If we untoasted / compressed the value, we need to free it + * after forming the index tuple. + */ + if (free_value) + untoasted_values[nuntoasted++] = value; + +#endif + + values[idxattno++] = value; + } + } + + /* Assert we did not overrun temp arrays */ + Assert(idxattno <= brdesc->bd_totalstored); + + /* compute total space needed */ + len = SizeOfBrinTuple; + if (anynulls) + { + /* + * We need a double-length bitmap on an on-disk BRIN index tuple; the + * first half stores the "allnulls" bits, the second stores + * "hasnulls". + */ + len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); + } + + len = hoff = MAXALIGN(len); + + data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc), + values, nulls); + len += data_len; + + len = MAXALIGN(len); + + rettuple = palloc0(len); + rettuple->bt_blkno = blkno; + rettuple->bt_info = hoff; + + /* Assert that hoff fits in the space available */ + Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff); + + /* + * The infomask and null bitmap as computed by heap_fill_tuple are useless + * to us. However, that function will not accept a null infomask; and we + * need to pass a valid null bitmap so that it will correctly skip + * outputting null attributes in the data area. + */ + heap_fill_tuple(brtuple_disk_tupdesc(brdesc), + values, + nulls, + (char *) rettuple + hoff, + data_len, + &phony_infomask, + phony_nullbitmap); + + /* done with these */ + pfree(values); + pfree(nulls); + pfree(phony_nullbitmap); + +#ifdef TOAST_INDEX_HACK + for (i = 0; i < nuntoasted; i++) + pfree(DatumGetPointer(untoasted_values[i])); +#endif + + /* + * Now fill in the real null bitmasks. allnulls first. + */ + if (anynulls) + { + bits8 *bitP; + int bitmask; + + rettuple->bt_info |= BRIN_NULLS_MASK; + + /* + * Note that we reverse the sense of null bits in this module: we + * store a 1 for a null attribute rather than a 0. So we must reverse + * the sense of the att_isnull test in brin_deconstruct_tuple as well. + */ + bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; + bitmask = HIGHBIT; + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + if (!tuple->bt_columns[keyno].bv_allnulls) + continue; + + *bitP |= bitmask; + } + /* hasnulls bits follow */ + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + if (!tuple->bt_columns[keyno].bv_hasnulls) + continue; + + *bitP |= bitmask; + } + } + + if (tuple->bt_placeholder) + rettuple->bt_info |= BRIN_PLACEHOLDER_MASK; + + *size = len; + return rettuple; +} + +/* + * Generate a new on-disk tuple with no data values, marked as placeholder. + * + * This is a cut-down version of brin_form_tuple. + */ +BrinTuple * +brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size) +{ + Size len; + Size hoff; + BrinTuple *rettuple; + int keyno; + bits8 *bitP; + int bitmask; + + /* compute total space needed: always add nulls */ + len = SizeOfBrinTuple; + len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); + len = hoff = MAXALIGN(len); + + rettuple = palloc0(len); + rettuple->bt_blkno = blkno; + rettuple->bt_info = hoff; + rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK; + + bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; + bitmask = HIGHBIT; + /* set allnulls true for all attributes */ + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + *bitP |= bitmask; + } + /* no need to set hasnulls */ + + *size = len; + return rettuple; +} + +/* + * Free a tuple created by brin_form_tuple + */ +void +brin_free_tuple(BrinTuple *tuple) +{ + pfree(tuple); +} + +/* + * Given a brin tuple of size len, create a copy of it. If 'dest' is not + * NULL, its size is destsz, and can be used as output buffer; if the tuple + * to be copied does not fit, it is enlarged by repalloc, and the size is + * updated to match. This avoids palloc/free cycles when many brin tuples + * are being processed in loops. + */ +BrinTuple * +brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz) +{ + if (!destsz || *destsz == 0) + dest = palloc(len); + else if (len > *destsz) + { + dest = repalloc(dest, len); + *destsz = len; + } + + memcpy(dest, tuple, len); + + return dest; +} + +/* + * Return whether two BrinTuples are bitwise identical. + */ +bool +brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen) +{ + if (alen != blen) + return false; + if (memcmp(a, b, alen) != 0) + return false; + return true; +} + +/* + * Create a new BrinMemTuple from scratch, and initialize it to an empty + * state. + * + * Note: we don't provide any means to free a deformed tuple, so make sure to + * use a temporary memory context. + */ +BrinMemTuple * +brin_new_memtuple(BrinDesc *brdesc) +{ + BrinMemTuple *dtup; + long basesize; + + basesize = MAXALIGN(sizeof(BrinMemTuple) + + sizeof(BrinValues) * brdesc->bd_tupdesc->natts); + dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored); + + dtup->bt_values = palloc(sizeof(Datum) * brdesc->bd_totalstored); + dtup->bt_allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + dtup->bt_hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + + dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext, + "brin dtuple", + ALLOCSET_DEFAULT_SIZES); + + brin_memtuple_initialize(dtup, brdesc); + + return dtup; +} + +/* + * Reset a BrinMemTuple to initial state. We return the same tuple, for + * notational convenience. + */ +BrinMemTuple * +brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc) +{ + int i; + char *currdatum; + + MemoryContextReset(dtuple->bt_context); + + currdatum = (char *) dtuple + + MAXALIGN(sizeof(BrinMemTuple) + + sizeof(BrinValues) * brdesc->bd_tupdesc->natts); + for (i = 0; i < brdesc->bd_tupdesc->natts; i++) + { + dtuple->bt_columns[i].bv_attno = i + 1; + dtuple->bt_columns[i].bv_allnulls = true; + dtuple->bt_columns[i].bv_hasnulls = false; + dtuple->bt_columns[i].bv_values = (Datum *) currdatum; + + dtuple->bt_columns[i].bv_mem_value = PointerGetDatum(NULL); + dtuple->bt_columns[i].bv_serialize = NULL; + dtuple->bt_columns[i].bv_context = dtuple->bt_context; + + currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored; + } + + return dtuple; +} + +/* + * Convert a BrinTuple back to a BrinMemTuple. This is the reverse of + * brin_form_tuple. + * + * As an optimization, the caller can pass a previously allocated 'dMemtuple'. + * This avoids having to allocate it here, which can be useful when this + * function is called many times in a loop. It is caller's responsibility + * that the given BrinMemTuple matches what we need here. + * + * Note we don't need the "on disk tupdesc" here; we rely on our own routine to + * deconstruct the tuple from the on-disk format. + */ +BrinMemTuple * +brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple) +{ + BrinMemTuple *dtup; + Datum *values; + bool *allnulls; + bool *hasnulls; + char *tp; + bits8 *nullbits; + int keyno; + int valueno; + MemoryContext oldcxt; + + dtup = dMemtuple ? brin_memtuple_initialize(dMemtuple, brdesc) : + brin_new_memtuple(brdesc); + + if (BrinTupleIsPlaceholder(tuple)) + dtup->bt_placeholder = true; + dtup->bt_blkno = tuple->bt_blkno; + + values = dtup->bt_values; + allnulls = dtup->bt_allnulls; + hasnulls = dtup->bt_hasnulls; + + tp = (char *) tuple + BrinTupleDataOffset(tuple); + + if (BrinTupleHasNulls(tuple)) + nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple); + else + nullbits = NULL; + brin_deconstruct_tuple(brdesc, + tp, nullbits, BrinTupleHasNulls(tuple), + values, allnulls, hasnulls); + + /* + * Iterate to assign each of the values to the corresponding item in the + * values array of each column. The copies occur in the tuple's context. + */ + oldcxt = MemoryContextSwitchTo(dtup->bt_context); + for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + int i; + + if (allnulls[keyno]) + { + valueno += brdesc->bd_info[keyno]->oi_nstored; + continue; + } + + /* + * We would like to skip datumCopy'ing the values datum in some cases, + * caller permitting ... + */ + for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++) + dtup->bt_columns[keyno].bv_values[i] = + datumCopy(values[valueno++], + brdesc->bd_info[keyno]->oi_typcache[i]->typbyval, + brdesc->bd_info[keyno]->oi_typcache[i]->typlen); + + dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno]; + dtup->bt_columns[keyno].bv_allnulls = false; + + dtup->bt_columns[keyno].bv_mem_value = PointerGetDatum(NULL); + dtup->bt_columns[keyno].bv_serialize = NULL; + dtup->bt_columns[keyno].bv_context = dtup->bt_context; + } + + MemoryContextSwitchTo(oldcxt); + + return dtup; +} + +/* + * brin_deconstruct_tuple + * Guts of attribute extraction from an on-disk BRIN tuple. + * + * Its arguments are: + * brdesc BRIN descriptor for the stored tuple + * tp pointer to the tuple data area + * nullbits pointer to the tuple nulls bitmask + * nulls "has nulls" bit in tuple infomask + * values output values, array of size brdesc->bd_totalstored + * allnulls output "allnulls", size brdesc->bd_tupdesc->natts + * hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts + * + * Output arrays must have been allocated by caller. + */ +static inline void +brin_deconstruct_tuple(BrinDesc *brdesc, + char *tp, bits8 *nullbits, bool nulls, + Datum *values, bool *allnulls, bool *hasnulls) +{ + int attnum; + int stored; + TupleDesc diskdsc; + long off; + + /* + * First iterate to natts to obtain both null flags for each attribute. + * Note that we reverse the sense of the att_isnull test, because we store + * 1 for a null value (rather than a 1 for a not null value as is the + * att_isnull convention used elsewhere.) See brin_form_tuple. + */ + for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) + { + /* + * the "all nulls" bit means that all values in the page range for + * this column are nulls. Therefore there are no values in the tuple + * data area. + */ + allnulls[attnum] = nulls && !att_isnull(attnum, nullbits); + + /* + * the "has nulls" bit means that some tuples have nulls, but others + * have not-null values. Therefore we know the tuple contains data + * for this column. + * + * The hasnulls bits follow the allnulls bits in the same bitmask. + */ + hasnulls[attnum] = + nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits); + } + + /* + * Iterate to obtain each attribute's stored values. Note that since we + * may reuse attribute entries for more than one column, we cannot cache + * offsets here. + */ + diskdsc = brtuple_disk_tupdesc(brdesc); + stored = 0; + off = 0; + for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) + { + int datumno; + + if (allnulls[attnum]) + { + stored += brdesc->bd_info[attnum]->oi_nstored; + continue; + } + + for (datumno = 0; + datumno < brdesc->bd_info[attnum]->oi_nstored; + datumno++) + { + Form_pg_attribute thisatt = TupleDescAttr(diskdsc, stored); + + if (thisatt->attlen == -1) + { + off = att_align_pointer(off, thisatt->attalign, -1, + tp + off); + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, thisatt->attalign); + } + + values[stored++] = fetchatt(thisatt, tp + off); + + off = att_addlength_pointer(off, thisatt->attlen, tp + off); + } + } +} |