/*-------------------------------------------------------------------------
 *
 * toast_internals.c
 *	  Functions for internal use by the TOAST system.
 *
 * Copyright (c) 2000-2023, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/access/common/toast_internals.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/detoast.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/heaptoast.h"
#include "access/table.h"
#include "access/toast_internals.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "common/pg_lzcompress.h"
#include "miscadmin.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

static bool toastrel_valueid_exists(Relation toastrel, Oid valueid);
static bool toastid_valueid_exists(Oid toastrelid, Oid valueid);

/* ----------
 * toast_compress_datum -
 *
 *	Create a compressed version of a varlena datum
 *
 *	If we fail (ie, compressed result is actually bigger than original)
 *	then return NULL.  We must not use compressed data if it'd expand
 *	the tuple!
 *
 *	We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without
 *	copying them.  But we can't handle external or compressed datums.
 *
 * value: the varlena datum to compress; must be neither external nor
 *		already compressed (both are asserted)
 * cmethod: requested compression method; if it is not valid according to
 *		CompressionMethodIsValid(), default_toast_compression is used
 *
 * Returns the compressed datum, or PointerGetDatum(NULL) when the
 * compression routine gave up or the result does not save enough space.
 * Raises ERROR if the (possibly defaulted) method is not one of the methods
 * handled in the switch below.
 * ----------
 */
Datum
toast_compress_datum(Datum value, char cmethod)
{
	struct varlena *tmp = NULL;
	int32		valsize;
	ToastCompressionId cmid = TOAST_INVALID_COMPRESSION_ID;

	Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value)));
	Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value)));

	valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value));

	/* If the compression method is not valid, use the current default */
	if (!CompressionMethodIsValid(cmethod))
		cmethod = default_toast_compression;

	/*
	 * Call appropriate compression routine for the compression method.
	 *
	 * The Datum is converted with DatumGetPointer(), exactly as the checks
	 * above do, rather than being cast to a pointer type directly.
	 */
	switch (cmethod)
	{
		case TOAST_PGLZ_COMPRESSION:
			tmp = pglz_compress_datum((const struct varlena *) DatumGetPointer(value));
			cmid = TOAST_PGLZ_COMPRESSION_ID;
			break;
		case TOAST_LZ4_COMPRESSION:
			tmp = lz4_compress_datum((const struct varlena *) DatumGetPointer(value));
			cmid = TOAST_LZ4_COMPRESSION_ID;
			break;
		default:
			elog(ERROR, "invalid compression method %c", cmethod);
	}

	if (tmp == NULL)
		return PointerGetDatum(NULL);

	/*
	 * We recheck the actual size even if compression reports success, because
	 * it might be satisfied with having saved as little as one byte in the
	 * compressed data --- which could turn into a net loss once you consider
	 * header and alignment padding.  Worst case, the compressed format might
	 * require three padding bytes (plus header, which is included in
	 * VARSIZE(tmp)), whereas the uncompressed format would take only one
	 * header byte and no padding if the value is short enough.  So we insist
	 * on a savings of more than 2 bytes to ensure we have a gain.
	 *
	 * The comparison is done in signed arithmetic.  VARSIZE() is an unsigned
	 * quantity (see varatt.h), so without the cast a valsize below 2 would
	 * turn "valsize - 2" into a huge unsigned number and a result larger
	 * than the input would be accepted as a successful compression.
	 */
	if ((int32) VARSIZE(tmp) < valsize - 2)
	{
		/* successful compression */
		Assert(cmid != TOAST_INVALID_COMPRESSION_ID);
		TOAST_COMPRESS_SET_SIZE_AND_COMPRESS_METHOD(tmp, valsize, cmid);
		return PointerGetDatum(tmp);
	}
	else
	{
		/* incompressible data */
		pfree(tmp);
		return PointerGetDatum(NULL);
	}
}

/* ----------
 * toast_save_datum -
 *
 *	Save one single datum into the secondary relation and return
 *	a Datum reference for it.
 *
 * rel: the main relation we're working with (not the toast rel!)
* value: datum to be pushed to toast storage * oldexternal: if not NULL, toast pointer previously representing the datum * options: options to be passed to heap_insert() for toast rows * ---------- */ Datum toast_save_datum(Relation rel, Datum value, struct varlena *oldexternal, int options) { Relation toastrel; Relation *toastidxs; HeapTuple toasttup; TupleDesc toasttupDesc; Datum t_values[3]; bool t_isnull[3]; CommandId mycid = GetCurrentCommandId(true); struct varlena *result; struct varatt_external toast_pointer; union { struct varlena hdr; /* this is to make the union big enough for a chunk: */ char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; /* ensure union is aligned well enough: */ int32 align_it; } chunk_data; int32 chunk_size; int32 chunk_seq = 0; char *data_p; int32 data_todo; Pointer dval = DatumGetPointer(value); int num_indexes; int validIndex; Assert(!VARATT_IS_EXTERNAL(value)); /* * Open the toast relation and its indexes. We can use the index to check * uniqueness of the OID we assign to the toasted item, even though it has * additional columns besides OID. */ toastrel = table_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); toasttupDesc = toastrel->rd_att; /* Open all the toast indexes and look for the valid one */ validIndex = toast_open_indexes(toastrel, RowExclusiveLock, &toastidxs, &num_indexes); /* * Get the data pointer and length, and compute va_rawsize and va_extinfo. * * va_rawsize is the size of the equivalent fully uncompressed datum, so * we have to adjust for short headers. * * va_extinfo stored the actual size of the data payload in the toast * records and the compression method in first 2 bits if data is * compressed. 
*/ if (VARATT_IS_SHORT(dval)) { data_p = VARDATA_SHORT(dval); data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT; toast_pointer.va_rawsize = data_todo + VARHDRSZ; /* as if not short */ toast_pointer.va_extinfo = data_todo; } else if (VARATT_IS_COMPRESSED(dval)) { data_p = VARDATA(dval); data_todo = VARSIZE(dval) - VARHDRSZ; /* rawsize in a compressed datum is just the size of the payload */ toast_pointer.va_rawsize = VARDATA_COMPRESSED_GET_EXTSIZE(dval) + VARHDRSZ; /* set external size and compression method */ VARATT_EXTERNAL_SET_SIZE_AND_COMPRESS_METHOD(toast_pointer, data_todo, VARDATA_COMPRESSED_GET_COMPRESS_METHOD(dval)); /* Assert that the numbers look like it's compressed */ Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)); } else { data_p = VARDATA(dval); data_todo = VARSIZE(dval) - VARHDRSZ; toast_pointer.va_rawsize = VARSIZE(dval); toast_pointer.va_extinfo = data_todo; } /* * Insert the correct table OID into the result TOAST pointer. * * Normally this is the actual OID of the target toast table, but during * table-rewriting operations such as CLUSTER, we have to insert the OID * of the table's real permanent toast table instead. rd_toastoid is set * if we have to substitute such an OID. */ if (OidIsValid(rel->rd_toastoid)) toast_pointer.va_toastrelid = rel->rd_toastoid; else toast_pointer.va_toastrelid = RelationGetRelid(toastrel); /* * Choose an OID to use as the value ID for this toast value. * * Normally we just choose an unused OID within the toast table. But * during table-rewriting operations where we are preserving an existing * toast table OID, we want to preserve toast value OIDs too. So, if * rd_toastoid is set and we had a prior external value from that same * toast table, re-use its value ID. If we didn't have a prior external * value (which is a corner case, but possible if the table's attstorage * options have been changed), we have to pick a value ID that doesn't * conflict with either new or existing toast value OIDs. 
*/ if (!OidIsValid(rel->rd_toastoid)) { /* normal case: just choose an unused OID */ toast_pointer.va_valueid = GetNewOidWithIndex(toastrel, RelationGetRelid(toastidxs[validIndex]), (AttrNumber) 1); } else { /* rewrite case: check to see if value was in old toast table */ toast_pointer.va_valueid = InvalidOid; if (oldexternal != NULL) { struct varatt_external old_toast_pointer; Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); /* Must copy to access aligned fields */ VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) { /* This value came from the old toast table; reuse its OID */ toast_pointer.va_valueid = old_toast_pointer.va_valueid; /* * There is a corner case here: the table rewrite might have * to copy both live and recently-dead versions of a row, and * those versions could easily reference the same toast value. * When we copy the second or later version of such a row, * reusing the OID will mean we select an OID that's already * in the new toast table. Check for that, and if so, just * fall through without writing the data again. * * While annoying and ugly-looking, this is a good thing * because it ensures that we wind up with only one copy of * the toast value when there is only one copy in the old * toast table. Before we detected this case, we'd have made * multiple copies, wasting space; and what's worse, the * copies belonging to already-deleted heap tuples would not * be reclaimed by VACUUM. 
*/ if (toastrel_valueid_exists(toastrel, toast_pointer.va_valueid)) { /* Match, so short-circuit the data storage loop below */ data_todo = 0; } } } if (toast_pointer.va_valueid == InvalidOid) { /* * new value; must choose an OID that doesn't conflict in either * old or new toast table */ do { toast_pointer.va_valueid = GetNewOidWithIndex(toastrel, RelationGetRelid(toastidxs[validIndex]), (AttrNumber) 1); } while (toastid_valueid_exists(rel->rd_toastoid, toast_pointer.va_valueid)); } } /* * Initialize constant parts of the tuple data */ t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid); t_values[2] = PointerGetDatum(&chunk_data); t_isnull[0] = false; t_isnull[1] = false; t_isnull[2] = false; /* * Split up the item into chunks */ while (data_todo > 0) { int i; CHECK_FOR_INTERRUPTS(); /* * Calculate the size of this chunk */ chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo); /* * Build a tuple and store it */ t_values[1] = Int32GetDatum(chunk_seq++); SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ); memcpy(VARDATA(&chunk_data), data_p, chunk_size); toasttup = heap_form_tuple(toasttupDesc, t_values, t_isnull); heap_insert(toastrel, toasttup, mycid, options, NULL); /* * Create the index entry. We cheat a little here by not using * FormIndexDatum: this relies on the knowledge that the index columns * are the same as the initial columns of the table for all the * indexes. We also cheat by not providing an IndexInfo: this is okay * for now because btree doesn't need one, but we might have to be * more honest someday. * * Note also that there had better not be any user-created index on * the TOAST table, since we don't bother to update anything else. */ for (i = 0; i < num_indexes; i++) { /* Only index relations marked as ready can be updated */ if (toastidxs[i]->rd_index->indisready) index_insert(toastidxs[i], t_values, t_isnull, &(toasttup->t_self), toastrel, toastidxs[i]->rd_index->indisunique ? 
UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, false, NULL); } /* * Free memory */ heap_freetuple(toasttup); /* * Move on to next chunk */ data_todo -= chunk_size; data_p += chunk_size; } /* * Done - close toast relation and its indexes but keep the lock until * commit, so as a concurrent reindex done directly on the toast relation * would be able to wait for this transaction. */ toast_close_indexes(toastidxs, num_indexes, NoLock); table_close(toastrel, NoLock); /* * Create the TOAST pointer value that we'll return */ result = (struct varlena *) palloc(TOAST_POINTER_SIZE); SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK); memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer)); return PointerGetDatum(result); } /* ---------- * toast_delete_datum - * * Delete a single external stored value. * ---------- */ void toast_delete_datum(Relation rel, Datum value, bool is_speculative) { struct varlena *attr = (struct varlena *) DatumGetPointer(value); struct varatt_external toast_pointer; Relation toastrel; Relation *toastidxs; ScanKeyData toastkey; SysScanDesc toastscan; HeapTuple toasttup; int num_indexes; int validIndex; SnapshotData SnapshotToast; if (!VARATT_IS_EXTERNAL_ONDISK(attr)) return; /* Must copy to access aligned fields */ VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); /* * Open the toast relation and its indexes */ toastrel = table_open(toast_pointer.va_toastrelid, RowExclusiveLock); /* Fetch valid relation used for process */ validIndex = toast_open_indexes(toastrel, RowExclusiveLock, &toastidxs, &num_indexes); /* * Setup a scan key to find chunks with matching va_valueid */ ScanKeyInit(&toastkey, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(toast_pointer.va_valueid)); /* * Find all the chunks. (We don't actually care whether we see them in * sequence or not, but since we've already locked the index we might as * well use systable_beginscan_ordered.) 
*/ init_toast_snapshot(&SnapshotToast); toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], &SnapshotToast, 1, &toastkey); while ((toasttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, delete it */ if (is_speculative) heap_abort_speculative(toastrel, &toasttup->t_self); else simple_heap_delete(toastrel, &toasttup->t_self); } /* * End scan and close relations but keep the lock until commit, so as a * concurrent reindex done directly on the toast relation would be able to * wait for this transaction. */ systable_endscan_ordered(toastscan); toast_close_indexes(toastidxs, num_indexes, NoLock); table_close(toastrel, NoLock); } /* ---------- * toastrel_valueid_exists - * * Test whether a toast value with the given ID exists in the toast relation. * For safety, we consider a value to exist if there are either live or dead * toast rows with that ID; see notes for GetNewOidWithIndex(). * ---------- */ static bool toastrel_valueid_exists(Relation toastrel, Oid valueid) { bool result = false; ScanKeyData toastkey; SysScanDesc toastscan; int num_indexes; int validIndex; Relation *toastidxs; /* Fetch a valid index relation */ validIndex = toast_open_indexes(toastrel, RowExclusiveLock, &toastidxs, &num_indexes); /* * Setup a scan key to find chunks with matching va_valueid */ ScanKeyInit(&toastkey, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(valueid)); /* * Is there any such chunk? 
*/ toastscan = systable_beginscan(toastrel, RelationGetRelid(toastidxs[validIndex]), true, SnapshotAny, 1, &toastkey); if (systable_getnext(toastscan) != NULL) result = true; systable_endscan(toastscan); /* Clean up */ toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); return result; } /* ---------- * toastid_valueid_exists - * * As above, but work from toast rel's OID not an open relation * ---------- */ static bool toastid_valueid_exists(Oid toastrelid, Oid valueid) { bool result; Relation toastrel; toastrel = table_open(toastrelid, AccessShareLock); result = toastrel_valueid_exists(toastrel, valueid); table_close(toastrel, AccessShareLock); return result; } /* ---------- * toast_get_valid_index * * Get OID of valid index associated to given toast relation. A toast * relation can have only one valid index at the same time. */ Oid toast_get_valid_index(Oid toastoid, LOCKMODE lock) { int num_indexes; int validIndex; Oid validIndexOid; Relation *toastidxs; Relation toastrel; /* Open the toast relation */ toastrel = table_open(toastoid, lock); /* Look for the valid index of the toast relation */ validIndex = toast_open_indexes(toastrel, lock, &toastidxs, &num_indexes); validIndexOid = RelationGetRelid(toastidxs[validIndex]); /* Close the toast relation and all its indexes */ toast_close_indexes(toastidxs, num_indexes, NoLock); table_close(toastrel, NoLock); return validIndexOid; } /* ---------- * toast_open_indexes * * Get an array of the indexes associated to the given toast relation * and return as well the position of the valid index used by the toast * relation in this array. It is the responsibility of the caller of this * function to close the indexes as well as free them. 
*/ int toast_open_indexes(Relation toastrel, LOCKMODE lock, Relation **toastidxs, int *num_indexes) { int i = 0; int res = 0; bool found = false; List *indexlist; ListCell *lc; /* Get index list of the toast relation */ indexlist = RelationGetIndexList(toastrel); Assert(indexlist != NIL); *num_indexes = list_length(indexlist); /* Open all the index relations */ *toastidxs = (Relation *) palloc(*num_indexes * sizeof(Relation)); foreach(lc, indexlist) (*toastidxs)[i++] = index_open(lfirst_oid(lc), lock); /* Fetch the first valid index in list */ for (i = 0; i < *num_indexes; i++) { Relation toastidx = (*toastidxs)[i]; if (toastidx->rd_index->indisvalid) { res = i; found = true; break; } } /* * Free index list, not necessary anymore as relations are opened and a * valid index has been found. */ list_free(indexlist); /* * The toast relation should have one valid index, so something is going * wrong if there is nothing. */ if (!found) elog(ERROR, "no valid index found for toast relation with Oid %u", RelationGetRelid(toastrel)); return res; } /* ---------- * toast_close_indexes * * Close an array of indexes for a toast relation and free it. This should * be called for a set of indexes opened previously with toast_open_indexes. */ void toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock) { int i; /* Close relations and clean up things */ for (i = 0; i < num_indexes; i++) index_close(toastidxs[i], lock); pfree(toastidxs); } /* ---------- * init_toast_snapshot * * Initialize an appropriate TOAST snapshot. We must use an MVCC snapshot * to initialize the TOAST snapshot; since we don't know which one to use, * just use the oldest one. This is safe: at worst, we will get a "snapshot * too old" error that might have been avoided otherwise. */ void init_toast_snapshot(Snapshot toast_snapshot) { Snapshot snapshot = GetOldestSnapshot(); /* * GetOldestSnapshot returns NULL if the session has no active snapshots. 
* We can get that if, for example, a procedure fetches a toasted value * into a local variable, commits, and then tries to detoast the value. * Such coding is unsafe, because once we commit there is nothing to * prevent the toast data from being deleted. Detoasting *must* happen in * the same transaction that originally fetched the toast pointer. Hence, * rather than trying to band-aid over the problem, throw an error. (This * is not very much protection, because in many scenarios the procedure * would have already created a new transaction snapshot, preventing us * from detecting the problem. But it's better than nothing, and for sure * we shouldn't expend code on masking the problem more.) */ if (snapshot == NULL) elog(ERROR, "cannot fetch toast data without an active snapshot"); /* * Catalog snapshots can be returned by GetOldestSnapshot() even if not * registered or active. That easily hides bugs around not having a * snapshot set up - most of the time there is a valid catalog snapshot. * So additionally insist that the current snapshot is registered or * active. */ Assert(HaveRegisteredOrActiveSnapshot()); InitToastSnapshot(*toast_snapshot, snapshot->lsn, snapshot->whenTaken); }