diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 13:44:03 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 13:44:03 +0000 |
commit | 293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch) | |
tree | fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /contrib/amcheck | |
parent | Initial commit. (diff) | |
download | postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip |
Adding upstream version 16.2.upstream/16.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/amcheck')
-rw-r--r-- | contrib/amcheck/.gitignore | 4 | ||||
-rw-r--r-- | contrib/amcheck/Makefile | 27 | ||||
-rw-r--r-- | contrib/amcheck/amcheck--1.0--1.1.sql | 29 | ||||
-rw-r--r-- | contrib/amcheck/amcheck--1.0.sql | 24 | ||||
-rw-r--r-- | contrib/amcheck/amcheck--1.1--1.2.sql | 19 | ||||
-rw-r--r-- | contrib/amcheck/amcheck--1.2--1.3.sql | 30 | ||||
-rw-r--r-- | contrib/amcheck/amcheck.control | 5 | ||||
-rw-r--r-- | contrib/amcheck/expected/check.out | 1 | ||||
-rw-r--r-- | contrib/amcheck/expected/check_btree.out | 210 | ||||
-rw-r--r-- | contrib/amcheck/expected/check_heap.out | 240 | ||||
-rw-r--r-- | contrib/amcheck/meson.build | 48 | ||||
-rw-r--r-- | contrib/amcheck/sql/check.sql | 1 | ||||
-rw-r--r-- | contrib/amcheck/sql/check_btree.sql | 146 | ||||
-rw-r--r-- | contrib/amcheck/sql/check_heap.sql | 147 | ||||
-rw-r--r-- | contrib/amcheck/t/001_verify_heapam.pl | 286 | ||||
-rw-r--r-- | contrib/amcheck/t/002_cic.pl | 64 | ||||
-rw-r--r-- | contrib/amcheck/t/003_cic_2pc.pl | 170 | ||||
-rw-r--r-- | contrib/amcheck/t/005_pitr.pl | 82 | ||||
-rw-r--r-- | contrib/amcheck/verify_heapam.c | 2082 | ||||
-rw-r--r-- | contrib/amcheck/verify_nbtree.c | 3347 |
20 files changed, 6962 insertions, 0 deletions
diff --git a/contrib/amcheck/.gitignore b/contrib/amcheck/.gitignore new file mode 100644 index 0000000..5dcb3ff --- /dev/null +++ b/contrib/amcheck/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile new file mode 100644 index 0000000..f830bd3 --- /dev/null +++ b/contrib/amcheck/Makefile @@ -0,0 +1,27 @@ +# contrib/amcheck/Makefile + +MODULE_big = amcheck +OBJS = \ + $(WIN32RES) \ + verify_heapam.o \ + verify_nbtree.o + +EXTENSION = amcheck +DATA = amcheck--1.2--1.3.sql amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql +PGFILEDESC = "amcheck - function for verifying relation integrity" + +REGRESS = check check_btree check_heap + +EXTRA_INSTALL = contrib/pg_walinspect +TAP_TESTS = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/amcheck +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/amcheck/amcheck--1.0--1.1.sql b/contrib/amcheck/amcheck--1.0--1.1.sql new file mode 100644 index 0000000..088416e --- /dev/null +++ b/contrib/amcheck/amcheck--1.0--1.1.sql @@ -0,0 +1,29 @@ +/* contrib/amcheck/amcheck--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.1'" to load this file. \quit + +-- In order to avoid issues with dependencies when updating amcheck to 1.1, +-- create new, overloaded versions of the 1.0 functions + +-- +-- bt_index_check() +-- +CREATE FUNCTION bt_index_check(index regclass, + heapallindexed boolean) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- +-- bt_index_parent_check() +-- +CREATE FUNCTION bt_index_parent_check(index regclass, + heapallindexed boolean) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_parent_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- Don't want these to be available to public +REVOKE ALL ON FUNCTION bt_index_check(regclass, boolean) FROM PUBLIC; +REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean) FROM PUBLIC; diff --git a/contrib/amcheck/amcheck--1.0.sql b/contrib/amcheck/amcheck--1.0.sql new file mode 100644 index 0000000..a6612d1 --- /dev/null +++ b/contrib/amcheck/amcheck--1.0.sql @@ -0,0 +1,24 @@ +/* contrib/amcheck/amcheck--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION amcheck" to load this file. \quit + +-- +-- bt_index_check() +-- +CREATE FUNCTION bt_index_check(index regclass) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- +-- bt_index_parent_check() +-- +CREATE FUNCTION bt_index_parent_check(index regclass) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_parent_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- Don't want these to be available to public +REVOKE ALL ON FUNCTION bt_index_check(regclass) FROM PUBLIC; +REVOKE ALL ON FUNCTION bt_index_parent_check(regclass) FROM PUBLIC; diff --git a/contrib/amcheck/amcheck--1.1--1.2.sql b/contrib/amcheck/amcheck--1.1--1.2.sql new file mode 100644 index 0000000..883530d --- /dev/null +++ b/contrib/amcheck/amcheck--1.1--1.2.sql @@ -0,0 +1,19 @@ +/* contrib/amcheck/amcheck--1.1--1.2.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.2'" to load this file. \quit + +-- In order to avoid issues with dependencies when updating amcheck to 1.2, +-- create new, overloaded version of the 1.1 function signature + +-- +-- bt_index_parent_check() +-- +CREATE FUNCTION bt_index_parent_check(index regclass, + heapallindexed boolean, rootdescend boolean) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_parent_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- Don't want this to be available to public +REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean, boolean) FROM PUBLIC; diff --git a/contrib/amcheck/amcheck--1.2--1.3.sql b/contrib/amcheck/amcheck--1.2--1.3.sql new file mode 100644 index 0000000..7237ab7 --- /dev/null +++ b/contrib/amcheck/amcheck--1.2--1.3.sql @@ -0,0 +1,30 @@ +/* contrib/amcheck/amcheck--1.2--1.3.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.3'" to load this file. \quit + +-- +-- verify_heapam() +-- +CREATE FUNCTION verify_heapam(relation regclass, + on_error_stop boolean default false, + check_toast boolean default false, + skip text default 'none', + startblock bigint default null, + endblock bigint default null, + blkno OUT bigint, + offnum OUT integer, + attnum OUT integer, + msg OUT text) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'verify_heapam' +LANGUAGE C; + +-- Don't want this to be available to public +REVOKE ALL ON FUNCTION verify_heapam(regclass, + boolean, + boolean, + text, + bigint, + bigint) +FROM PUBLIC; diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control new file mode 100644 index 0000000..ab50931 --- /dev/null +++ b/contrib/amcheck/amcheck.control @@ -0,0 +1,5 @@ +# amcheck extension +comment = 'functions for verifying relation integrity' +default_version = '1.3' +module_pathname = '$libdir/amcheck' +relocatable = true diff --git a/contrib/amcheck/expected/check.out b/contrib/amcheck/expected/check.out new file mode 100644 index 0000000..5b3e6d5 --- /dev/null +++ b/contrib/amcheck/expected/check.out @@ -0,0 +1 @@ +CREATE EXTENSION amcheck; diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out new file mode 100644 index 0000000..38791bb --- /dev/null +++ b/contrib/amcheck/expected/check_btree.out @@ -0,0 +1,210 @@ +CREATE TABLE bttest_a(id int8); +CREATE TABLE bttest_b(id int8); +CREATE TABLE bttest_multi(id int8, data int8); +CREATE TABLE delete_test_table (a bigint, b bigint, c bigint, d bigint); +-- Stabalize tests +ALTER TABLE bttest_a SET (autovacuum_enabled = false); +ALTER TABLE bttest_b SET (autovacuum_enabled = false); +ALTER TABLE bttest_multi SET (autovacuum_enabled = false); +ALTER TABLE delete_test_table SET (autovacuum_enabled = false); +INSERT INTO bttest_a SELECT * FROM generate_series(1, 100000); +INSERT INTO bttest_b SELECT * FROM generate_series(100000, 1, -1); +INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i; +CREATE INDEX bttest_a_idx ON bttest_a USING btree (id) WITH (deduplicate_items = ON); +CREATE INDEX bttest_b_idx ON bttest_b USING btree (id); +CREATE UNIQUE INDEX bttest_multi_idx ON bttest_multi +USING btree (id) INCLUDE (data); +CREATE ROLE regress_bttest_role; +-- verify permissions are checked (error due to function not callable) +SET ROLE regress_bttest_role; +SELECT bt_index_check('bttest_a_idx'::regclass); +ERROR: permission denied for function bt_index_check +SELECT bt_index_parent_check('bttest_a_idx'::regclass); +ERROR: permission denied for function bt_index_parent_check +RESET ROLE; +-- we, intentionally, don't check relation permissions - it's useful +-- to run this cluster-wide with a restricted account, and as tested +-- above explicit permission has to be granted for that. +GRANT EXECUTE ON FUNCTION bt_index_check(regclass) TO regress_bttest_role; +GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass) TO regress_bttest_role; +GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO regress_bttest_role; +GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO regress_bttest_role; +SET ROLE regress_bttest_role; +SELECT bt_index_check('bttest_a_idx'); + bt_index_check +---------------- + +(1 row) + +SELECT bt_index_parent_check('bttest_a_idx'); + bt_index_parent_check +----------------------- + +(1 row) + +RESET ROLE; +-- verify plain tables are rejected (error) +SELECT bt_index_check('bttest_a'); +ERROR: "bttest_a" is not an index +SELECT bt_index_parent_check('bttest_a'); +ERROR: "bttest_a" is not an index +-- verify non-existing indexes are rejected (error) +SELECT bt_index_check(17); +ERROR: could not open relation with OID 17 +SELECT bt_index_parent_check(17); +ERROR: could not open relation with OID 17 +-- verify wrong index types are rejected (error) +BEGIN; +CREATE INDEX bttest_a_brin_idx ON bttest_a USING brin(id); +SELECT bt_index_parent_check('bttest_a_brin_idx'); +ERROR: only B-Tree indexes are supported as targets for verification +DETAIL: Relation "bttest_a_brin_idx" is not a B-Tree index. +ROLLBACK; +-- normal check outside of xact +SELECT bt_index_check('bttest_a_idx'); + bt_index_check +---------------- + +(1 row) + +-- more expansive tests +SELECT bt_index_check('bttest_a_idx', true); + bt_index_check +---------------- + +(1 row) + +SELECT bt_index_parent_check('bttest_b_idx', true); + bt_index_parent_check +----------------------- + +(1 row) + +BEGIN; +SELECT bt_index_check('bttest_a_idx'); + bt_index_check +---------------- + +(1 row) + +SELECT bt_index_parent_check('bttest_b_idx'); + bt_index_parent_check +----------------------- + +(1 row) + +-- make sure we don't have any leftover locks +SELECT * FROM pg_locks +WHERE relation = ANY(ARRAY['bttest_a', 'bttest_a_idx', 'bttest_b', 'bttest_b_idx']::regclass[]) + AND pid = pg_backend_pid(); + locktype | database | relation | page | tuple | virtualxid | transactionid | classid | objid | objsubid | virtualtransaction | pid | mode | granted | fastpath | waitstart +----------+----------+----------+------+-------+------------+---------------+---------+-------+----------+--------------------+-----+------+---------+----------+----------- +(0 rows) + +COMMIT; +-- Deduplication +TRUNCATE bttest_a; +INSERT INTO bttest_a SELECT 42 FROM generate_series(1, 2000); +SELECT bt_index_check('bttest_a_idx', true); + bt_index_check +---------------- + +(1 row) + +-- normal check outside of xact for index with included columns +SELECT bt_index_check('bttest_multi_idx'); + bt_index_check +---------------- + +(1 row) + +-- more expansive tests for index with included columns +SELECT bt_index_parent_check('bttest_multi_idx', true, true); + bt_index_parent_check +----------------------- + +(1 row) + +-- repeat expansive tests for index built using insertions +TRUNCATE bttest_multi; +INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i; +SELECT bt_index_parent_check('bttest_multi_idx', true, true); + bt_index_parent_check +----------------------- + +(1 row) + +-- +-- Test for multilevel page deletion/downlink present checks, and rootdescend +-- checks +-- +INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; +ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); +-- Delete most entries, and vacuum, deleting internal pages and creating "fast +-- root" +DELETE FROM delete_test_table WHERE a < 79990; +VACUUM delete_test_table; +SELECT bt_index_parent_check('delete_test_table_pkey', true); + bt_index_parent_check +----------------------- + +(1 row) + +-- +-- BUG #15597: must not assume consistent input toasting state when forming +-- tuple. Bloom filter must fingerprint normalized index tuple representation. +-- +CREATE TABLE toast_bug(buggy text); +ALTER TABLE toast_bug ALTER COLUMN buggy SET STORAGE extended; +CREATE INDEX toasty ON toast_bug(buggy); +-- pg_attribute entry for toasty.buggy (the index) will have plain storage: +UPDATE pg_attribute SET attstorage = 'p' +WHERE attrelid = 'toasty'::regclass AND attname = 'buggy'; +-- Whereas pg_attribute entry for toast_bug.buggy (the table) still has extended storage: +SELECT attstorage FROM pg_attribute +WHERE attrelid = 'toast_bug'::regclass AND attname = 'buggy'; + attstorage +------------ + x +(1 row) + +-- Insert compressible heap tuple (comfortably exceeds TOAST_TUPLE_THRESHOLD): +INSERT INTO toast_bug SELECT repeat('a', 2200); +-- Should not get false positive report of corruption: +SELECT bt_index_check('toasty', true); + bt_index_check +---------------- + +(1 row) + +-- +-- Check that index expressions and predicates are run as the table's owner +-- +TRUNCATE bttest_a; +INSERT INTO bttest_a SELECT * FROM generate_series(1, 1000); +ALTER TABLE bttest_a OWNER TO regress_bttest_role; +-- A dummy index function checking current_user +CREATE FUNCTION ifun(int8) RETURNS int8 AS $$ +BEGIN + ASSERT current_user = 'regress_bttest_role', + format('ifun(%s) called by %s', $1, current_user); + RETURN $1; +END; +$$ LANGUAGE plpgsql IMMUTABLE; +CREATE INDEX bttest_a_expr_idx ON bttest_a ((ifun(id) + ifun(0))) + WHERE ifun(id + 10) > ifun(10); +SELECT bt_index_check('bttest_a_expr_idx', true); + bt_index_check +---------------- + +(1 row) + +-- cleanup +DROP TABLE bttest_a; +DROP TABLE bttest_b; +DROP TABLE bttest_multi; +DROP TABLE delete_test_table; +DROP TABLE toast_bug; +DROP FUNCTION ifun(int8); +DROP OWNED BY regress_bttest_role; -- permissions +DROP ROLE regress_bttest_role; diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out new file mode 100644 index 0000000..979e5e8 --- /dev/null +++ b/contrib/amcheck/expected/check_heap.out @@ -0,0 +1,240 @@ +CREATE TABLE heaptest (a integer, b text); +REVOKE ALL ON heaptest FROM PUBLIC; +-- Check that invalid skip option is rejected +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'rope'); +ERROR: invalid skip option +HINT: Valid skip options are "all-visible", "all-frozen", and "none". +-- Check specifying invalid block ranges when verifying an empty table +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 0, endblock := 0); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 5, endblock := 8); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +-- Check that valid options are not rejected nor corruption reported +-- for an empty table, and that skip enum-like parameter is case-insensitive +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-frozen'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-visible'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'None'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'All-Frozen'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'All-Visible'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'NONE'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'ALL-FROZEN'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'ALL-VISIBLE'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +-- Add some data so subsequent tests are not entirely trivial +INSERT INTO heaptest (a, b) + (SELECT gs, repeat('x', gs) + FROM generate_series(1,50) gs); +-- pg_stat_io test: +-- verify_heapam always uses a BAS_BULKREAD BufferAccessStrategy, whereas a +-- sequential scan does so only if the table is large enough when compared to +-- shared buffers (see initscan()). CREATE DATABASE ... also unconditionally +-- uses a BAS_BULKREAD strategy, but we have chosen to use a tablespace and +-- verify_heapam to provide coverage instead of adding another expensive +-- operation to the main regression test suite. +-- +-- Create an alternative tablespace and move the heaptest table to it, causing +-- it to be rewritten and all the blocks to reliably evicted from shared +-- buffers -- guaranteeing actual reads when we next select from it in the +-- same transaction. The heaptest table is smaller than the default +-- wal_skip_threshold, so a wal_level=minimal commit reads the table into +-- shared_buffers. A transaction delays that and excludes any autovacuum. +SET allow_in_place_tablespaces = true; +CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; +SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset +BEGIN; +ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; +-- Check that valid options are not rejected nor corruption reported +-- for a non-empty table +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-frozen'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-visible'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 0, endblock := 0); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +COMMIT; +-- verify_heapam should have read in the page written out by +-- ALTER TABLE ... SET TABLESPACE ... +-- causing an additional bulkread, which should be reflected in pg_stat_io. +SELECT pg_stat_force_next_flush(); + pg_stat_force_next_flush +-------------------------- + +(1 row) + +SELECT sum(reads) AS stats_bulkreads_after + FROM pg_stat_io WHERE context = 'bulkread' \gset +SELECT :stats_bulkreads_after > :stats_bulkreads_before; + ?column? +---------- + t +(1 row) + +CREATE ROLE regress_heaptest_role; +-- verify permissions are checked (error due to function not callable) +SET ROLE regress_heaptest_role; +SELECT * FROM verify_heapam(relation := 'heaptest'); +ERROR: permission denied for function verify_heapam +RESET ROLE; +GRANT EXECUTE ON FUNCTION verify_heapam(regclass, boolean, boolean, text, bigint, bigint) TO regress_heaptest_role; +-- verify permissions are now sufficient +SET ROLE regress_heaptest_role; +SELECT * FROM verify_heapam(relation := 'heaptest'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +RESET ROLE; +-- Check specifying invalid block ranges when verifying a non-empty table. +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 0, endblock := 10000); +ERROR: ending block number must be between 0 and 0 +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 10000, endblock := 11000); +ERROR: starting block number must be between 0 and 0 +-- Vacuum freeze to change the xids encountered in subsequent tests +VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) heaptest; +-- Check that valid options are not rejected nor corruption reported +-- for a non-empty frozen table +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-frozen'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-visible'); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 0, endblock := 0); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +-- Check that partitioned tables (the parent ones) which don't have visibility +-- maps are rejected +CREATE TABLE test_partitioned (a int, b text default repeat('x', 5000)) + PARTITION BY list (a); +SELECT * FROM verify_heapam('test_partitioned', + startblock := NULL, + endblock := NULL); +ERROR: cannot check relation "test_partitioned" +DETAIL: This operation is not supported for partitioned tables. +-- Check that valid options are not rejected nor corruption reported +-- for an empty partition table (the child one) +CREATE TABLE test_partition partition OF test_partitioned FOR VALUES IN (1); +SELECT * FROM verify_heapam('test_partition', + startblock := NULL, + endblock := NULL); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +-- Check that valid options are not rejected nor corruption reported +-- for a non-empty partition table (the child one) +INSERT INTO test_partitioned (a) (SELECT 1 FROM generate_series(1,1000) gs); +SELECT * FROM verify_heapam('test_partition', + startblock := NULL, + endblock := NULL); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +-- Check that indexes are rejected +CREATE INDEX test_index ON test_partition (a); +SELECT * FROM verify_heapam('test_index', + startblock := NULL, + endblock := NULL); +ERROR: cannot check relation "test_index" +DETAIL: This operation is not supported for indexes. +-- Check that views are rejected +CREATE VIEW test_view AS SELECT 1; +SELECT * FROM verify_heapam('test_view', + startblock := NULL, + endblock := NULL); +ERROR: cannot check relation "test_view" +DETAIL: This operation is not supported for views. +-- Check that sequences are rejected +CREATE SEQUENCE test_sequence; +SELECT * FROM verify_heapam('test_sequence', + startblock := NULL, + endblock := NULL); + blkno | offnum | attnum | msg +-------+--------+--------+----- +(0 rows) + +-- Check that foreign tables are rejected +CREATE FOREIGN DATA WRAPPER dummy; +CREATE SERVER dummy_server FOREIGN DATA WRAPPER dummy; +CREATE FOREIGN TABLE test_foreign_table () SERVER dummy_server; +SELECT * FROM verify_heapam('test_foreign_table', + startblock := NULL, + endblock := NULL); +ERROR: cannot check relation "test_foreign_table" +DETAIL: This operation is not supported for foreign tables. +-- cleanup +DROP TABLE heaptest; +DROP TABLESPACE regress_test_stats_tblspc; +DROP TABLE test_partition; +DROP TABLE test_partitioned; +DROP OWNED BY regress_heaptest_role; -- permissions +DROP ROLE regress_heaptest_role; diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build new file mode 100644 index 0000000..78a46bf --- /dev/null +++ b/contrib/amcheck/meson.build @@ -0,0 +1,48 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +amcheck_sources = files( + 'verify_heapam.c', + 'verify_nbtree.c', +) + +if host_system == 'windows' + amcheck_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'amcheck', + '--FILEDESC', 'amcheck - function for verifying relation integrity',]) +endif + +amcheck = shared_module('amcheck', + amcheck_sources, + kwargs: contrib_mod_args, +) +contrib_targets += amcheck + +install_data( + 'amcheck.control', + 'amcheck--1.0.sql', + 'amcheck--1.0--1.1.sql', + 'amcheck--1.1--1.2.sql', + 'amcheck--1.2--1.3.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'amcheck', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'check', + 'check_btree', + 'check_heap', + ], + }, + 'tap': { + 'tests': [ + 't/001_verify_heapam.pl', + 't/002_cic.pl', + 't/003_cic_2pc.pl', + 't/005_pitr.pl', + ], + }, +} diff --git a/contrib/amcheck/sql/check.sql b/contrib/amcheck/sql/check.sql new file mode 100644 index 0000000..5b3e6d5 --- /dev/null +++ b/contrib/amcheck/sql/check.sql @@ -0,0 +1 @@ +CREATE EXTENSION amcheck; diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql new file mode 100644 index 0000000..033c04b --- /dev/null +++ b/contrib/amcheck/sql/check_btree.sql @@ -0,0 +1,146 @@ +CREATE TABLE bttest_a(id int8); +CREATE TABLE bttest_b(id int8); +CREATE TABLE bttest_multi(id int8, data int8); +CREATE TABLE delete_test_table (a bigint, b bigint, c bigint, d bigint); + +-- Stabalize tests +ALTER TABLE bttest_a SET (autovacuum_enabled = false); +ALTER TABLE bttest_b SET (autovacuum_enabled = false); +ALTER TABLE bttest_multi SET (autovacuum_enabled = false); +ALTER TABLE delete_test_table SET (autovacuum_enabled = false); + +INSERT INTO bttest_a SELECT * FROM generate_series(1, 100000); +INSERT INTO bttest_b SELECT * FROM generate_series(100000, 1, -1); +INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i; + +CREATE INDEX bttest_a_idx ON bttest_a USING btree (id) WITH (deduplicate_items = ON); +CREATE INDEX bttest_b_idx ON bttest_b USING btree (id); +CREATE UNIQUE INDEX bttest_multi_idx ON bttest_multi +USING btree (id) INCLUDE (data); + +CREATE ROLE regress_bttest_role; + +-- verify permissions are checked (error due to function not callable) +SET ROLE regress_bttest_role; +SELECT bt_index_check('bttest_a_idx'::regclass); +SELECT bt_index_parent_check('bttest_a_idx'::regclass); +RESET ROLE; + +-- we, intentionally, don't check relation permissions - it's useful +-- to run this cluster-wide with a restricted account, and as tested +-- above explicit permission has to be granted for that. +GRANT EXECUTE ON FUNCTION bt_index_check(regclass) TO regress_bttest_role; +GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass) TO regress_bttest_role; +GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO regress_bttest_role; +GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO regress_bttest_role; +SET ROLE regress_bttest_role; +SELECT bt_index_check('bttest_a_idx'); +SELECT bt_index_parent_check('bttest_a_idx'); +RESET ROLE; + +-- verify plain tables are rejected (error) +SELECT bt_index_check('bttest_a'); +SELECT bt_index_parent_check('bttest_a'); + +-- verify non-existing indexes are rejected (error) +SELECT bt_index_check(17); +SELECT bt_index_parent_check(17); + +-- verify wrong index types are rejected (error) +BEGIN; +CREATE INDEX bttest_a_brin_idx ON bttest_a USING brin(id); +SELECT bt_index_parent_check('bttest_a_brin_idx'); +ROLLBACK; + +-- normal check outside of xact +SELECT bt_index_check('bttest_a_idx'); +-- more expansive tests +SELECT bt_index_check('bttest_a_idx', true); +SELECT bt_index_parent_check('bttest_b_idx', true); + +BEGIN; +SELECT bt_index_check('bttest_a_idx'); +SELECT bt_index_parent_check('bttest_b_idx'); +-- make sure we don't have any leftover locks +SELECT * FROM pg_locks +WHERE relation = ANY(ARRAY['bttest_a', 'bttest_a_idx', 'bttest_b', 'bttest_b_idx']::regclass[]) + AND pid = pg_backend_pid(); +COMMIT; + +-- Deduplication +TRUNCATE bttest_a; +INSERT INTO bttest_a SELECT 42 FROM generate_series(1, 2000); +SELECT bt_index_check('bttest_a_idx', true); + +-- normal check outside of xact for index with included columns +SELECT bt_index_check('bttest_multi_idx'); +-- more expansive tests for index with included columns +SELECT bt_index_parent_check('bttest_multi_idx', true, true); + +-- repeat expansive tests for index built using insertions +TRUNCATE bttest_multi; +INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i; +SELECT bt_index_parent_check('bttest_multi_idx', true, true); + +-- +-- Test for multilevel page deletion/downlink present checks, and rootdescend +-- checks +-- +INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; +ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); +-- Delete most entries, and vacuum, deleting internal pages and creating "fast +-- root" +DELETE FROM delete_test_table WHERE a < 79990; +VACUUM delete_test_table; +SELECT bt_index_parent_check('delete_test_table_pkey', true); + +-- +-- BUG #15597: must not assume consistent input toasting state when forming +-- tuple. Bloom filter must fingerprint normalized index tuple representation. +-- +CREATE TABLE toast_bug(buggy text); +ALTER TABLE toast_bug ALTER COLUMN buggy SET STORAGE extended; +CREATE INDEX toasty ON toast_bug(buggy); + +-- pg_attribute entry for toasty.buggy (the index) will have plain storage: +UPDATE pg_attribute SET attstorage = 'p' +WHERE attrelid = 'toasty'::regclass AND attname = 'buggy'; + +-- Whereas pg_attribute entry for toast_bug.buggy (the table) still has extended storage: +SELECT attstorage FROM pg_attribute +WHERE attrelid = 'toast_bug'::regclass AND attname = 'buggy'; + +-- Insert compressible heap tuple (comfortably exceeds TOAST_TUPLE_THRESHOLD): +INSERT INTO toast_bug SELECT repeat('a', 2200); +-- Should not get false positive report of corruption: +SELECT bt_index_check('toasty', true); + +-- +-- Check that index expressions and predicates are run as the table's owner +-- +TRUNCATE bttest_a; +INSERT INTO bttest_a SELECT * FROM generate_series(1, 1000); +ALTER TABLE bttest_a OWNER TO regress_bttest_role; +-- A dummy index function checking current_user +CREATE FUNCTION ifun(int8) RETURNS int8 AS $$ +BEGIN + ASSERT current_user = 'regress_bttest_role', + format('ifun(%s) called by %s', $1, current_user); + RETURN $1; +END; +$$ LANGUAGE plpgsql IMMUTABLE; + +CREATE INDEX bttest_a_expr_idx ON bttest_a ((ifun(id) + ifun(0))) + WHERE ifun(id + 10) > ifun(10); + +SELECT bt_index_check('bttest_a_expr_idx', true); + +-- cleanup +DROP TABLE bttest_a; +DROP TABLE bttest_b; +DROP TABLE bttest_multi; +DROP TABLE delete_test_table; +DROP TABLE toast_bug; +DROP FUNCTION ifun(int8); +DROP OWNED BY regress_bttest_role; -- permissions +DROP ROLE regress_bttest_role; diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql new file mode 100644 index 0000000..1745bae --- /dev/null +++ b/contrib/amcheck/sql/check_heap.sql @@ -0,0 +1,147 @@ +CREATE TABLE heaptest (a integer, b text); +REVOKE ALL ON heaptest FROM PUBLIC; + +-- Check that invalid skip option is rejected +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'rope'); + +-- Check specifying invalid block ranges when verifying an empty table +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 0, endblock := 0); +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 5, endblock := 8); + +-- Check that valid options are not rejected nor corruption reported +-- for an empty table, and that skip enum-like parameter is case-insensitive +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-frozen'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-visible'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'None'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'All-Frozen'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'All-Visible'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'NONE'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'ALL-FROZEN'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'ALL-VISIBLE'); + + +-- Add some data so subsequent tests are not entirely trivial +INSERT INTO heaptest (a, b) + (SELECT gs, repeat('x', gs) + FROM generate_series(1,50) gs); + +-- pg_stat_io test: +-- verify_heapam always uses a BAS_BULKREAD BufferAccessStrategy, whereas a +-- sequential scan does so only if the table is large enough when compared to +-- shared buffers (see initscan()). CREATE DATABASE ... also unconditionally +-- uses a BAS_BULKREAD strategy, but we have chosen to use a tablespace and +-- verify_heapam to provide coverage instead of adding another expensive +-- operation to the main regression test suite. +-- +-- Create an alternative tablespace and move the heaptest table to it, causing +-- it to be rewritten and all the blocks to reliably evicted from shared +-- buffers -- guaranteeing actual reads when we next select from it in the +-- same transaction. The heaptest table is smaller than the default +-- wal_skip_threshold, so a wal_level=minimal commit reads the table into +-- shared_buffers. A transaction delays that and excludes any autovacuum. +SET allow_in_place_tablespaces = true; +CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; +SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset +BEGIN; +ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; +-- Check that valid options are not rejected nor corruption reported +-- for a non-empty table +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-frozen'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-visible'); +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 0, endblock := 0); +COMMIT; + +-- verify_heapam should have read in the page written out by +-- ALTER TABLE ... SET TABLESPACE ... +-- causing an additional bulkread, which should be reflected in pg_stat_io. +SELECT pg_stat_force_next_flush(); +SELECT sum(reads) AS stats_bulkreads_after + FROM pg_stat_io WHERE context = 'bulkread' \gset +SELECT :stats_bulkreads_after > :stats_bulkreads_before; + +CREATE ROLE regress_heaptest_role; + +-- verify permissions are checked (error due to function not callable) +SET ROLE regress_heaptest_role; +SELECT * FROM verify_heapam(relation := 'heaptest'); +RESET ROLE; + +GRANT EXECUTE ON FUNCTION verify_heapam(regclass, boolean, boolean, text, bigint, bigint) TO regress_heaptest_role; + +-- verify permissions are now sufficient +SET ROLE regress_heaptest_role; +SELECT * FROM verify_heapam(relation := 'heaptest'); +RESET ROLE; + +-- Check specifying invalid block ranges when verifying a non-empty table. +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 0, endblock := 10000); +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 10000, endblock := 11000); + +-- Vacuum freeze to change the xids encountered in subsequent tests +VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) heaptest; + +-- Check that valid options are not rejected nor corruption reported +-- for a non-empty frozen table +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-frozen'); +SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'all-visible'); +SELECT * FROM verify_heapam(relation := 'heaptest', startblock := 0, endblock := 0); + +-- Check that partitioned tables (the parent ones) which don't have visibility +-- maps are rejected +CREATE TABLE test_partitioned (a int, b text default repeat('x', 5000)) + PARTITION BY list (a); +SELECT * FROM verify_heapam('test_partitioned', + startblock := NULL, + endblock := NULL); + +-- Check that valid options are not rejected nor corruption reported +-- for an empty partition table (the child one) +CREATE TABLE test_partition partition OF test_partitioned FOR VALUES IN (1); +SELECT * FROM verify_heapam('test_partition', + startblock := NULL, + endblock := NULL); + +-- Check that valid options are not rejected nor corruption reported +-- for a non-empty partition table (the child one) +INSERT INTO test_partitioned (a) (SELECT 1 FROM generate_series(1,1000) gs); +SELECT * FROM verify_heapam('test_partition', + startblock := NULL, + endblock := NULL); + +-- Check that indexes are rejected +CREATE INDEX test_index ON test_partition (a); +SELECT * FROM verify_heapam('test_index', + startblock := NULL, + endblock := NULL); + +-- Check that views are rejected +CREATE VIEW test_view AS SELECT 1; +SELECT * FROM verify_heapam('test_view', + startblock := NULL, + endblock := NULL); + +-- Check that sequences are rejected +CREATE SEQUENCE test_sequence; +SELECT * FROM verify_heapam('test_sequence', + startblock := NULL, + endblock := NULL); + +-- Check that foreign tables are rejected +CREATE FOREIGN DATA WRAPPER dummy; +CREATE SERVER dummy_server FOREIGN DATA WRAPPER dummy; +CREATE FOREIGN TABLE test_foreign_table () SERVER dummy_server; +SELECT * FROM verify_heapam('test_foreign_table', + startblock := NULL, + endblock := NULL); + +-- cleanup +DROP TABLE heaptest; +DROP TABLESPACE regress_test_stats_tblspc; +DROP TABLE test_partition; +DROP TABLE test_partitioned; +DROP OWNED BY regress_heaptest_role; -- permissions +DROP ROLE regress_heaptest_role; diff --git a/contrib/amcheck/t/001_verify_heapam.pl b/contrib/amcheck/t/001_verify_heapam.pl new file mode 100644 index 0000000..46d5b53 --- /dev/null +++ b/contrib/amcheck/t/001_verify_heapam.pl @@ -0,0 +1,286 @@ + +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +use strict; +use warnings; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; + +use Test::More; + +my ($node, $result); + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('test'); +$node->init; +$node->append_conf('postgresql.conf', 'autovacuum=off'); +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); + +# +# Check a table with data loaded but no corruption, freezing, etc. +# +fresh_test_table('test'); +check_all_options_uncorrupted('test', 'plain'); + +# +# Check a corrupt table +# +fresh_test_table('test'); +corrupt_first_page('test'); +detects_heap_corruption("verify_heapam('test')", "plain corrupted table"); +detects_heap_corruption( + "verify_heapam('test', skip := 'all-visible')", + "plain corrupted table skipping all-visible"); +detects_heap_corruption( + "verify_heapam('test', skip := 'all-frozen')", + "plain corrupted table skipping all-frozen"); +detects_heap_corruption( + "verify_heapam('test', check_toast := false)", + "plain corrupted table skipping toast"); +detects_heap_corruption( + "verify_heapam('test', startblock := 0, endblock := 0)", + "plain corrupted table checking only block zero"); + +# +# Check a corrupt table with all-frozen data +# +fresh_test_table('test'); +$node->safe_psql('postgres', q(VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) test)); +detects_no_corruption("verify_heapam('test')", + "all-frozen not corrupted table"); +corrupt_first_page('test'); +detects_heap_corruption("verify_heapam('test')", + "all-frozen corrupted table"); +detects_no_corruption( + "verify_heapam('test', skip := 'all-frozen')", + "all-frozen corrupted table skipping all-frozen"); + +# +# Check a sequence with no corruption. The current implementation of sequences +# doesn't require its own test setup, since sequences are really just heap +# tables under-the-hood. To guard against future implementation changes made +# without remembering to update verify_heapam, we create and exercise a +# sequence, checking along the way that it passes corruption checks. +# +fresh_test_sequence('test_seq'); +check_all_options_uncorrupted('test_seq', 'plain'); +advance_test_sequence('test_seq'); +check_all_options_uncorrupted('test_seq', 'plain'); +set_test_sequence('test_seq'); +check_all_options_uncorrupted('test_seq', 'plain'); +reset_test_sequence('test_seq'); +check_all_options_uncorrupted('test_seq', 'plain'); + +# Returns the filesystem path for the named relation. +sub relation_filepath +{ + my ($relname) = @_; + + my $pgdata = $node->data_dir; + my $rel = $node->safe_psql('postgres', + qq(SELECT pg_relation_filepath('$relname'))); + die "path not found for relation $relname" unless defined $rel; + return "$pgdata/$rel"; +} + +# Returns the fully qualified name of the toast table for the named relation +sub get_toast_for +{ + my ($relname) = @_; + + return $node->safe_psql( + 'postgres', qq( + SELECT 'pg_toast.' || t.relname + FROM pg_catalog.pg_class c, pg_catalog.pg_class t + WHERE c.relname = '$relname' + AND c.reltoastrelid = t.oid)); +} + +# (Re)create and populate a test table of the given name. +sub fresh_test_table +{ + my ($relname) = @_; + + return $node->safe_psql( + 'postgres', qq( + DROP TABLE IF EXISTS $relname CASCADE; + CREATE TABLE $relname (a integer, b text); + ALTER TABLE $relname SET (autovacuum_enabled=false); + ALTER TABLE $relname ALTER b SET STORAGE external; + INSERT INTO $relname (a, b) + (SELECT gs, repeat('b',gs*10) FROM generate_series(1,1000) gs); + BEGIN; + SAVEPOINT s1; + SELECT 1 FROM $relname WHERE a = 42 FOR UPDATE; + UPDATE $relname SET b = b WHERE a = 42; + RELEASE s1; + SAVEPOINT s1; + SELECT 1 FROM $relname WHERE a = 42 FOR UPDATE; + UPDATE $relname SET b = b WHERE a = 42; + COMMIT; + )); +} + +# Create a test sequence of the given name. +sub fresh_test_sequence +{ + my ($seqname) = @_; + + return $node->safe_psql( + 'postgres', qq( + DROP SEQUENCE IF EXISTS $seqname CASCADE; + CREATE SEQUENCE $seqname + INCREMENT BY 13 + MINVALUE 17 + START WITH 23; + SELECT nextval('$seqname'); + SELECT setval('$seqname', currval('$seqname') + nextval('$seqname')); + )); +} + +# Call SQL functions to increment the sequence +sub advance_test_sequence +{ + my ($seqname) = @_; + + return $node->safe_psql( + 'postgres', qq( + SELECT nextval('$seqname'); + )); +} + +# Call SQL functions to set the sequence +sub set_test_sequence +{ + my ($seqname) = @_; + + return $node->safe_psql( + 'postgres', qq( + SELECT setval('$seqname', 102); + )); +} + +# Call SQL functions to reset the sequence +sub reset_test_sequence +{ + my ($seqname) = @_; + + return $node->safe_psql( + 'postgres', qq( + ALTER SEQUENCE $seqname RESTART WITH 51 + )); +} + +# Stops the test node, corrupts the first page of the named relation, and +# restarts the node. +sub corrupt_first_page +{ + my ($relname) = @_; + my $relpath = relation_filepath($relname); + + $node->stop; + + my $fh; + open($fh, '+<', $relpath) + or BAIL_OUT("open failed: $!"); + binmode $fh; + + # Corrupt some line pointers. The values are chosen to hit the + # various line-pointer-corruption checks in verify_heapam.c + # on both little-endian and big-endian architectures. + sysseek($fh, 32, 0) + or BAIL_OUT("sysseek failed: $!"); + syswrite( + $fh, + pack("L*", + 0xAAA15550, 0xAAA0D550, 0x00010000, + 0x00008000, 0x0000800F, 0x001e8000) + ) or BAIL_OUT("syswrite failed: $!"); + close($fh) + or BAIL_OUT("close failed: $!"); + + $node->start; +} + +sub detects_heap_corruption +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my ($function, $testname) = @_; + + detects_corruption( + $function, + $testname, + qr/line pointer redirection to item at offset \d+ precedes minimum offset \d+/, + qr/line pointer redirection to item at offset \d+ exceeds maximum offset \d+/, + qr/line pointer to page offset \d+ is not maximally aligned/, + qr/line pointer length \d+ is less than the minimum tuple header size \d+/, + qr/line pointer to page offset \d+ with length \d+ ends beyond maximum page offset \d+/, + ); +} + +sub detects_corruption +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my ($function, $testname, @re) = @_; + + my $result = $node->safe_psql('postgres', qq(SELECT * FROM $function)); + like($result, $_, $testname) for (@re); +} + +sub detects_no_corruption +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my ($function, $testname) = @_; + + my $result = $node->safe_psql('postgres', qq(SELECT * FROM $function)); + is($result, '', $testname); +} + +# Check various options are stable (don't abort) and do not report corruption +# when running verify_heapam on an uncorrupted test table. +# +# The relname *must* be an uncorrupted table, or this will fail. +# +# The prefix is used to identify the test, along with the options, +# and should be unique. +sub check_all_options_uncorrupted +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my ($relname, $prefix) = @_; + + for my $stop (qw(true false)) + { + for my $check_toast (qw(true false)) + { + for my $skip ("'none'", "'all-frozen'", "'all-visible'") + { + for my $startblock (qw(NULL 0)) + { + for my $endblock (qw(NULL 0)) + { + my $opts = + "on_error_stop := $stop, " + . "check_toast := $check_toast, " + . "skip := $skip, " + . "startblock := $startblock, " + . "endblock := $endblock"; + + detects_no_corruption( + "verify_heapam('$relname', $opts)", + "$prefix: $opts"); + } + } + } + } + } +} + +done_testing(); diff --git a/contrib/amcheck/t/002_cic.pl b/contrib/amcheck/t/002_cic.pl new file mode 100644 index 0000000..42a047a --- /dev/null +++ b/contrib/amcheck/t/002_cic.pl @@ -0,0 +1,64 @@ + +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +# Test CREATE INDEX CONCURRENTLY with concurrent modifications +use strict; +use warnings; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; + +use Test::More; + +my ($node, $result); + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('CIC_test'); +$node->init; +$node->append_conf('postgresql.conf', + 'lock_timeout = ' . (1000 * $PostgreSQL::Test::Utils::timeout_default)); +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql('postgres', q(CREATE TABLE tbl(i int))); +$node->safe_psql('postgres', q(CREATE INDEX idx ON tbl(i))); + +# +# Stress CIC with pgbench. +# +# pgbench might try to launch more than one instance of the CIC +# transaction concurrently. That would deadlock, so use an advisory +# lock to ensure only one CIC runs at a time. +# +$node->pgbench( + '--no-vacuum --client=5 --transactions=100', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent INSERTs and CIC', + { + '002_pgbench_concurrent_transaction' => q( + BEGIN; + INSERT INTO tbl VALUES(0); + COMMIT; + ), + '002_pgbench_concurrent_transaction_savepoints' => q( + BEGIN; + SAVEPOINT s1; + INSERT INTO tbl VALUES(0); + COMMIT; + ), + '002_pgbench_concurrent_cic' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + DROP INDEX CONCURRENTLY idx; + CREATE INDEX CONCURRENTLY idx ON tbl(i); + SELECT bt_index_check('idx',true); + SELECT pg_advisory_unlock(42); + \endif + ) + }); + +$node->stop; +done_testing(); diff --git a/contrib/amcheck/t/003_cic_2pc.pl b/contrib/amcheck/t/003_cic_2pc.pl new file mode 100644 index 0000000..3279a25 --- /dev/null +++ b/contrib/amcheck/t/003_cic_2pc.pl @@ -0,0 +1,170 @@ + +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +# Test CREATE INDEX CONCURRENTLY with concurrent prepared-xact modifications +use strict; +use warnings; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; + +use Test::More; + +Test::More->builder->todo_start('filesystem bug') + if PostgreSQL::Test::Utils::has_wal_read_bug; + +my ($node, $result); + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('CIC_2PC_test'); +$node->init; +$node->append_conf('postgresql.conf', 'max_prepared_transactions = 10'); +$node->append_conf('postgresql.conf', + 'lock_timeout = ' . (1000 * $PostgreSQL::Test::Utils::timeout_default)); +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql('postgres', q(CREATE TABLE tbl(i int))); + + +# +# Run 3 overlapping 2PC transactions with CIC +# +# We have two concurrent background psql processes: $main_h for INSERTs and +# $cic_h for CIC. Also, we use non-background psql for some COMMIT PREPARED +# statements. +# + +my $main_h = $node->background_psql('postgres'); + +$main_h->query_safe( + q( +BEGIN; +INSERT INTO tbl VALUES(0); +)); + +my $cic_h = $node->background_psql('postgres'); + +$cic_h->query_until( + qr/start/, q( +\echo start +CREATE INDEX CONCURRENTLY idx ON tbl(i); +)); + +$main_h->query_safe( + q( +PREPARE TRANSACTION 'a'; +)); + +$main_h->query_safe( + q( +BEGIN; +INSERT INTO tbl VALUES(0); +)); + +$node->safe_psql('postgres', q(COMMIT PREPARED 'a';)); + +$main_h->query_safe( + q( +PREPARE TRANSACTION 'b'; +BEGIN; +INSERT INTO tbl VALUES(0); +)); + +$node->safe_psql('postgres', q(COMMIT PREPARED 'b';)); + +$main_h->query_safe( + q( +PREPARE TRANSACTION 'c'; +COMMIT PREPARED 'c'; +)); + +$main_h->quit; +$cic_h->quit; + +$result = $node->psql('postgres', q(SELECT bt_index_check('idx',true))); +is($result, '0', 'bt_index_check after overlapping 2PC'); + + +# +# Server restart shall not change whether prepared xact blocks CIC +# + +$node->safe_psql( + 'postgres', q( +BEGIN; +INSERT INTO tbl VALUES(0); +PREPARE TRANSACTION 'spans_restart'; +BEGIN; +CREATE TABLE unused (); +PREPARE TRANSACTION 'persists_forever'; +)); +$node->restart; + +my $reindex_h = $node->background_psql('postgres'); +$reindex_h->query_until( + qr/start/, q( +\echo start +DROP INDEX CONCURRENTLY idx; +CREATE INDEX CONCURRENTLY idx ON tbl(i); +)); + +$node->safe_psql('postgres', "COMMIT PREPARED 'spans_restart'"); +$reindex_h->quit; +$result = $node->psql('postgres', q(SELECT bt_index_check('idx',true))); +is($result, '0', 'bt_index_check after 2PC and restart'); + + +# +# Stress CIC+2PC with pgbench +# +# pgbench might try to launch more than one instance of the CIC +# transaction concurrently. That would deadlock, so use an advisory +# lock to ensure only one CIC runs at a time. + +# Fix broken index first +$node->safe_psql('postgres', q(REINDEX TABLE tbl;)); + +# Run pgbench. +$node->pgbench( + '--no-vacuum --client=5 --transactions=100', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent INSERTs w/ 2PC and CIC', + { + '003_pgbench_concurrent_2pc' => q( + BEGIN; + INSERT INTO tbl VALUES(0); + PREPARE TRANSACTION 'c:client_id'; + COMMIT PREPARED 'c:client_id'; + ), + '003_pgbench_concurrent_2pc_savepoint' => q( + BEGIN; + SAVEPOINT s1; + INSERT INTO tbl VALUES(0); + PREPARE TRANSACTION 'c:client_id'; + COMMIT PREPARED 'c:client_id'; + ), + '003_pgbench_concurrent_cic' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + DROP INDEX CONCURRENTLY idx; + CREATE INDEX CONCURRENTLY idx ON tbl(i); + SELECT bt_index_check('idx',true); + SELECT pg_advisory_unlock(42); + \endif + ), + '004_pgbench_concurrent_ric' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + REINDEX INDEX CONCURRENTLY idx; + SELECT bt_index_check('idx',true); + SELECT pg_advisory_unlock(42); + \endif + ) + }); + +$node->stop; +done_testing(); diff --git a/contrib/amcheck/t/005_pitr.pl b/contrib/amcheck/t/005_pitr.pl new file mode 100644 index 0000000..6bcc159 --- /dev/null +++ b/contrib/amcheck/t/005_pitr.pl @@ -0,0 +1,82 @@ +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +# Test integrity of intermediate states by PITR to those states +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# origin node: generate WAL records of interest. +my $origin = PostgreSQL::Test::Cluster->new('origin'); +$origin->init(has_archiving => 1, allows_streaming => 1); +$origin->append_conf('postgresql.conf', 'autovacuum = off'); +$origin->start; +$origin->backup('my_backup'); +# Create a table with each of 6 PK values spanning 1/4 of a block. Delete the +# first four, so one index leaf is eligible for deletion. Make a replication +# slot just so pg_walinspect will always have access to later WAL. +my $setup = <<EOSQL; +BEGIN; +CREATE EXTENSION amcheck; +CREATE EXTENSION pg_walinspect; +CREATE TABLE not_leftmost (c text STORAGE PLAIN); +INSERT INTO not_leftmost + SELECT repeat(n::text, database_block_size / 4) + FROM generate_series(1,6) t(n), pg_control_init(); +ALTER TABLE not_leftmost ADD CONSTRAINT not_leftmost_pk PRIMARY KEY (c); +DELETE FROM not_leftmost WHERE c ~ '^[1-4]'; +SELECT pg_create_physical_replication_slot('for_walinspect', true, false); +COMMIT; +EOSQL +$origin->safe_psql('postgres', $setup); +my $before_vacuum_lsn = + $origin->safe_psql('postgres', "SELECT pg_current_wal_lsn()"); +# VACUUM to delete the aforementioned leaf page. Force an XLogFlush() by +# dropping a permanent table. That way, the XLogReader infrastructure can +# always see VACUUM's records, even under synchronous_commit=off. Finally, +# find the LSN of that VACUUM's last UNLINK_PAGE record. +my $vacuum = <<EOSQL; +SET synchronous_commit = off; +VACUUM (VERBOSE, INDEX_CLEANUP ON) not_leftmost; +CREATE TABLE XLogFlush (); +DROP TABLE XLogFlush; +SELECT max(start_lsn) + FROM pg_get_wal_records_info('$before_vacuum_lsn', 'FFFFFFFF/FFFFFFFF') + WHERE resource_manager = 'Btree' AND record_type = 'UNLINK_PAGE'; +EOSQL +my $unlink_lsn = $origin->safe_psql('postgres', $vacuum); +$origin->stop; +die "did not find UNLINK_PAGE record" unless $unlink_lsn; + +# replica node: amcheck at notable points in the WAL stream +my $replica = PostgreSQL::Test::Cluster->new('replica'); +$replica->init_from_backup($origin, 'my_backup', has_restoring => 1); +$replica->append_conf('postgresql.conf', + "recovery_target_lsn = '$unlink_lsn'"); +$replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off'); +$replica->append_conf('postgresql.conf', 'recovery_target_action = promote'); +$replica->start; +$replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';") + or die "Timed out while waiting for PITR promotion"; +# recovery done; run amcheck +my $debug = "SET client_min_messages = 'debug1'"; +my ($rc, $stderr); +$rc = $replica->psql( + 'postgres', + "$debug; SELECT bt_index_parent_check('not_leftmost_pk', true)", + stderr => \$stderr); +print STDERR $stderr, "\n"; +is($rc, 0, "bt_index_parent_check passes"); +like( + $stderr, + qr/interrupted page deletion detected/, + "bt_index_parent_check: interrupted page deletion detected"); +$rc = $replica->psql( + 'postgres', + "$debug; SELECT bt_index_check('not_leftmost_pk', true)", + stderr => \$stderr); +print STDERR $stderr, "\n"; +is($rc, 0, "bt_index_check passes"); + +done_testing(); diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c new file mode 100644 index 0000000..97f3253 --- /dev/null +++ b/contrib/amcheck/verify_heapam.c @@ -0,0 +1,2082 @@ +/*------------------------------------------------------------------------- + * + * verify_heapam.c + * Functions to check postgresql heap relations for corruption + * + * Copyright (c) 2016-2023, PostgreSQL Global Development Group + * + * contrib/amcheck/verify_heapam.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/detoast.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heaptoast.h" +#include "access/multixact.h" +#include "access/toast_internals.h" +#include "access/visibilitymap.h" +#include "catalog/pg_am.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" + +PG_FUNCTION_INFO_V1(verify_heapam); + +/* The number of columns in tuples returned by verify_heapam */ +#define HEAPCHECK_RELATION_COLS 4 + +/* The largest valid toast va_rawsize */ +#define VARLENA_SIZE_LIMIT 0x3FFFFFFF + +/* + * Despite the name, we use this for reporting problems with both XIDs and + * MXIDs. + */ +typedef enum XidBoundsViolation +{ + XID_INVALID, + XID_IN_FUTURE, + XID_PRECEDES_CLUSTERMIN, + XID_PRECEDES_RELMIN, + XID_BOUNDS_OK +} XidBoundsViolation; + +typedef enum XidCommitStatus +{ + XID_COMMITTED, + XID_IS_CURRENT_XID, + XID_IN_PROGRESS, + XID_ABORTED +} XidCommitStatus; + +typedef enum SkipPages +{ + SKIP_PAGES_ALL_FROZEN, + SKIP_PAGES_ALL_VISIBLE, + SKIP_PAGES_NONE +} SkipPages; + +/* + * Struct holding information about a toasted attribute sufficient to both + * check the toasted attribute and, if found to be corrupt, to report where it + * was encountered in the main table. + */ +typedef struct ToastedAttribute +{ + struct varatt_external toast_pointer; + BlockNumber blkno; /* block in main table */ + OffsetNumber offnum; /* offset in main table */ + AttrNumber attnum; /* attribute in main table */ +} ToastedAttribute; + +/* + * Struct holding the running context information during + * a lifetime of a verify_heapam execution. + */ +typedef struct HeapCheckContext +{ + /* + * Cached copies of values from ShmemVariableCache and computed values + * from them. + */ + FullTransactionId next_fxid; /* ShmemVariableCache->nextXid */ + TransactionId next_xid; /* 32-bit version of next_fxid */ + TransactionId oldest_xid; /* ShmemVariableCache->oldestXid */ + FullTransactionId oldest_fxid; /* 64-bit version of oldest_xid, computed + * relative to next_fxid */ + TransactionId safe_xmin; /* this XID and newer ones can't become + * all-visible while we're running */ + + /* + * Cached copy of value from MultiXactState + */ + MultiXactId next_mxact; /* MultiXactState->nextMXact */ + MultiXactId oldest_mxact; /* MultiXactState->oldestMultiXactId */ + + /* + * Cached copies of the most recently checked xid and its status. + */ + TransactionId cached_xid; + XidCommitStatus cached_status; + + /* Values concerning the heap relation being checked */ + Relation rel; + TransactionId relfrozenxid; + FullTransactionId relfrozenfxid; + TransactionId relminmxid; + Relation toast_rel; + Relation *toast_indexes; + Relation valid_toast_index; + int num_toast_indexes; + + /* Values for iterating over pages in the relation */ + BlockNumber blkno; + BufferAccessStrategy bstrategy; + Buffer buffer; + Page page; + + /* Values for iterating over tuples within a page */ + OffsetNumber offnum; + ItemId itemid; + uint16 lp_len; + uint16 lp_off; + HeapTupleHeader tuphdr; + int natts; + + /* Values for iterating over attributes within the tuple */ + uint32 offset; /* offset in tuple data */ + AttrNumber attnum; + + /* True if tuple's xmax makes it eligible for pruning */ + bool tuple_could_be_pruned; + + /* + * List of ToastedAttribute structs for toasted attributes which are not + * eligible for pruning and should be checked + */ + List *toasted_attributes; + + /* Whether verify_heapam has yet encountered any corrupt tuples */ + bool is_corrupt; + + /* The descriptor and tuplestore for verify_heapam's result tuples */ + TupleDesc tupdesc; + Tuplestorestate *tupstore; +} HeapCheckContext; + +/* Internal implementation */ +static void check_tuple(HeapCheckContext *ctx, + bool *xmin_commit_status_ok, + XidCommitStatus *xmin_commit_status); +static void check_toast_tuple(HeapTuple toasttup, HeapCheckContext *ctx, + ToastedAttribute *ta, int32 *expected_chunk_seq, + uint32 extsize); + +static bool check_tuple_attribute(HeapCheckContext *ctx); +static void check_toasted_attribute(HeapCheckContext *ctx, + ToastedAttribute *ta); + +static bool check_tuple_header(HeapCheckContext *ctx); +static bool check_tuple_visibility(HeapCheckContext *ctx, + bool *xmin_commit_status_ok, + XidCommitStatus *xmin_commit_status); + +static void report_corruption(HeapCheckContext *ctx, char *msg); +static void report_toast_corruption(HeapCheckContext *ctx, + ToastedAttribute *ta, char *msg); +static FullTransactionId FullTransactionIdFromXidAndCtx(TransactionId xid, + const HeapCheckContext *ctx); +static void update_cached_xid_range(HeapCheckContext *ctx); +static void update_cached_mxid_range(HeapCheckContext *ctx); +static XidBoundsViolation check_mxid_in_range(MultiXactId mxid, + HeapCheckContext *ctx); +static XidBoundsViolation check_mxid_valid_in_rel(MultiXactId mxid, + HeapCheckContext *ctx); +static XidBoundsViolation get_xid_status(TransactionId xid, + HeapCheckContext *ctx, + XidCommitStatus *status); + +/* + * Scan and report corruption in heap pages, optionally reconciling toasted + * attributes with entries in the associated toast table. Intended to be + * called from SQL with the following parameters: + * + * relation: + * The Oid of the heap relation to be checked. + * + * on_error_stop: + * Whether to stop at the end of the first page for which errors are + * detected. Note that multiple rows may be returned. + * + * check_toast: + * Whether to check each toasted attribute against the toast table to + * verify that it can be found there. + * + * skip: + * What kinds of pages in the heap relation should be skipped. Valid + * options are "all-visible", "all-frozen", and "none". + * + * Returns to the SQL caller a set of tuples, each containing the location + * and a description of a corruption found in the heap. + * + * This code goes to some trouble to avoid crashing the server even if the + * table pages are badly corrupted, but it's probably not perfect. If + * check_toast is true, we'll use regular index lookups to try to fetch TOAST + * tuples, which can certainly cause crashes if the right kind of corruption + * exists in the toast table or index. No matter what parameters you pass, + * we can't protect against crashes that might occur trying to look up the + * commit status of transaction IDs (though we avoid trying to do such lookups + * for transaction IDs that can't legally appear in the table). + */ +Datum +verify_heapam(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + HeapCheckContext ctx; + Buffer vmbuffer = InvalidBuffer; + Oid relid; + bool on_error_stop; + bool check_toast; + SkipPages skip_option = SKIP_PAGES_NONE; + BlockNumber first_block; + BlockNumber last_block; + BlockNumber nblocks; + const char *skip; + + /* Check supplied arguments */ + if (PG_ARGISNULL(0)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("relation cannot be null"))); + relid = PG_GETARG_OID(0); + + if (PG_ARGISNULL(1)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("on_error_stop cannot be null"))); + on_error_stop = PG_GETARG_BOOL(1); + + if (PG_ARGISNULL(2)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("check_toast cannot be null"))); + check_toast = PG_GETARG_BOOL(2); + + if (PG_ARGISNULL(3)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("skip cannot be null"))); + skip = text_to_cstring(PG_GETARG_TEXT_PP(3)); + if (pg_strcasecmp(skip, "all-visible") == 0) + skip_option = SKIP_PAGES_ALL_VISIBLE; + else if (pg_strcasecmp(skip, "all-frozen") == 0) + skip_option = SKIP_PAGES_ALL_FROZEN; + else if (pg_strcasecmp(skip, "none") == 0) + skip_option = SKIP_PAGES_NONE; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid skip option"), + errhint("Valid skip options are \"all-visible\", \"all-frozen\", and \"none\"."))); + + memset(&ctx, 0, sizeof(HeapCheckContext)); + ctx.cached_xid = InvalidTransactionId; + ctx.toasted_attributes = NIL; + + /* + * Any xmin newer than the xmin of our snapshot can't become all-visible + * while we're running. + */ + ctx.safe_xmin = GetTransactionSnapshot()->xmin; + + /* + * If we report corruption when not examining some individual attribute, + * we need attnum to be reported as NULL. Set that up before any + * corruption reporting might happen. + */ + ctx.attnum = -1; + + /* Construct the tuplestore and tuple descriptor */ + InitMaterializedSRF(fcinfo, 0); + ctx.tupdesc = rsinfo->setDesc; + ctx.tupstore = rsinfo->setResult; + + /* Open relation, check relkind and access method */ + ctx.rel = relation_open(relid, AccessShareLock); + + /* + * Check that a relation's relkind and access method are both supported. + */ + if (!RELKIND_HAS_TABLE_AM(ctx.rel->rd_rel->relkind) && + ctx.rel->rd_rel->relkind != RELKIND_SEQUENCE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot check relation \"%s\"", + RelationGetRelationName(ctx.rel)), + errdetail_relkind_not_supported(ctx.rel->rd_rel->relkind))); + + /* + * Sequences always use heap AM, but they don't show that in the catalogs. + * Other relkinds might be using a different AM, so check. + */ + if (ctx.rel->rd_rel->relkind != RELKIND_SEQUENCE && + ctx.rel->rd_rel->relam != HEAP_TABLE_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only heap AM is supported"))); + + /* + * Early exit for unlogged relations during recovery. These will have no + * relation fork, so there won't be anything to check. We behave as if + * the relation is empty. + */ + if (ctx.rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && + RecoveryInProgress()) + { + ereport(DEBUG1, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot verify unlogged relation \"%s\" during recovery, skipping", + RelationGetRelationName(ctx.rel)))); + relation_close(ctx.rel, AccessShareLock); + PG_RETURN_NULL(); + } + + /* Early exit if the relation is empty */ + nblocks = RelationGetNumberOfBlocks(ctx.rel); + if (!nblocks) + { + relation_close(ctx.rel, AccessShareLock); + PG_RETURN_NULL(); + } + + ctx.bstrategy = GetAccessStrategy(BAS_BULKREAD); + ctx.buffer = InvalidBuffer; + ctx.page = NULL; + + /* Validate block numbers, or handle nulls. */ + if (PG_ARGISNULL(4)) + first_block = 0; + else + { + int64 fb = PG_GETARG_INT64(4); + + if (fb < 0 || fb >= nblocks) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("starting block number must be between 0 and %u", + nblocks - 1))); + first_block = (BlockNumber) fb; + } + if (PG_ARGISNULL(5)) + last_block = nblocks - 1; + else + { + int64 lb = PG_GETARG_INT64(5); + + if (lb < 0 || lb >= nblocks) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ending block number must be between 0 and %u", + nblocks - 1))); + last_block = (BlockNumber) lb; + } + + /* Optionally open the toast relation, if any. */ + if (ctx.rel->rd_rel->reltoastrelid && check_toast) + { + int offset; + + /* Main relation has associated toast relation */ + ctx.toast_rel = table_open(ctx.rel->rd_rel->reltoastrelid, + AccessShareLock); + offset = toast_open_indexes(ctx.toast_rel, + AccessShareLock, + &(ctx.toast_indexes), + &(ctx.num_toast_indexes)); + ctx.valid_toast_index = ctx.toast_indexes[offset]; + } + else + { + /* + * Main relation has no associated toast relation, or we're + * intentionally skipping it. + */ + ctx.toast_rel = NULL; + ctx.toast_indexes = NULL; + ctx.num_toast_indexes = 0; + } + + update_cached_xid_range(&ctx); + update_cached_mxid_range(&ctx); + ctx.relfrozenxid = ctx.rel->rd_rel->relfrozenxid; + ctx.relfrozenfxid = FullTransactionIdFromXidAndCtx(ctx.relfrozenxid, &ctx); + ctx.relminmxid = ctx.rel->rd_rel->relminmxid; + + if (TransactionIdIsNormal(ctx.relfrozenxid)) + ctx.oldest_xid = ctx.relfrozenxid; + + for (ctx.blkno = first_block; ctx.blkno <= last_block; ctx.blkno++) + { + OffsetNumber maxoff; + OffsetNumber predecessor[MaxOffsetNumber]; + OffsetNumber successor[MaxOffsetNumber]; + bool lp_valid[MaxOffsetNumber]; + bool xmin_commit_status_ok[MaxOffsetNumber]; + XidCommitStatus xmin_commit_status[MaxOffsetNumber]; + + CHECK_FOR_INTERRUPTS(); + + memset(predecessor, 0, sizeof(OffsetNumber) * MaxOffsetNumber); + + /* Optionally skip over all-frozen or all-visible blocks */ + if (skip_option != SKIP_PAGES_NONE) + { + int32 mapbits; + + mapbits = (int32) visibilitymap_get_status(ctx.rel, ctx.blkno, + &vmbuffer); + if (skip_option == SKIP_PAGES_ALL_FROZEN) + { + if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0) + continue; + } + + if (skip_option == SKIP_PAGES_ALL_VISIBLE) + { + if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0) + continue; + } + } + + /* Read and lock the next page. */ + ctx.buffer = ReadBufferExtended(ctx.rel, MAIN_FORKNUM, ctx.blkno, + RBM_NORMAL, ctx.bstrategy); + LockBuffer(ctx.buffer, BUFFER_LOCK_SHARE); + ctx.page = BufferGetPage(ctx.buffer); + + /* Perform tuple checks */ + maxoff = PageGetMaxOffsetNumber(ctx.page); + for (ctx.offnum = FirstOffsetNumber; ctx.offnum <= maxoff; + ctx.offnum = OffsetNumberNext(ctx.offnum)) + { + BlockNumber nextblkno; + OffsetNumber nextoffnum; + + successor[ctx.offnum] = InvalidOffsetNumber; + lp_valid[ctx.offnum] = false; + xmin_commit_status_ok[ctx.offnum] = false; + ctx.itemid = PageGetItemId(ctx.page, ctx.offnum); + + /* Skip over unused/dead line pointers */ + if (!ItemIdIsUsed(ctx.itemid) || ItemIdIsDead(ctx.itemid)) + continue; + + /* + * If this line pointer has been redirected, check that it + * redirects to a valid offset within the line pointer array + */ + if (ItemIdIsRedirected(ctx.itemid)) + { + OffsetNumber rdoffnum = ItemIdGetRedirect(ctx.itemid); + ItemId rditem; + + if (rdoffnum < FirstOffsetNumber) + { + report_corruption(&ctx, + psprintf("line pointer redirection to item at offset %u precedes minimum offset %u", + (unsigned) rdoffnum, + (unsigned) FirstOffsetNumber)); + continue; + } + if (rdoffnum > maxoff) + { + report_corruption(&ctx, + psprintf("line pointer redirection to item at offset %u exceeds maximum offset %u", + (unsigned) rdoffnum, + (unsigned) maxoff)); + continue; + } + + /* + * Since we've checked that this redirect points to a line + * pointer between FirstOffsetNumber and maxoff, it should now + * be safe to fetch the referenced line pointer. We expect it + * to be LP_NORMAL; if not, that's corruption. + */ + rditem = PageGetItemId(ctx.page, rdoffnum); + if (!ItemIdIsUsed(rditem)) + { + report_corruption(&ctx, + psprintf("redirected line pointer points to an unused item at offset %u", + (unsigned) rdoffnum)); + continue; + } + else if (ItemIdIsDead(rditem)) + { + report_corruption(&ctx, + psprintf("redirected line pointer points to a dead item at offset %u", + (unsigned) rdoffnum)); + continue; + } + else if (ItemIdIsRedirected(rditem)) + { + report_corruption(&ctx, + psprintf("redirected line pointer points to another redirected line pointer at offset %u", + (unsigned) rdoffnum)); + continue; + } + + /* + * Record the fact that this line pointer has passed basic + * sanity checking, and also the offset number to which it + * points. + */ + lp_valid[ctx.offnum] = true; + successor[ctx.offnum] = rdoffnum; + continue; + } + + /* Sanity-check the line pointer's offset and length values */ + ctx.lp_len = ItemIdGetLength(ctx.itemid); + ctx.lp_off = ItemIdGetOffset(ctx.itemid); + + if (ctx.lp_off != MAXALIGN(ctx.lp_off)) + { + report_corruption(&ctx, + psprintf("line pointer to page offset %u is not maximally aligned", + ctx.lp_off)); + continue; + } + if (ctx.lp_len < MAXALIGN(SizeofHeapTupleHeader)) + { + report_corruption(&ctx, + psprintf("line pointer length %u is less than the minimum tuple header size %u", + ctx.lp_len, + (unsigned) MAXALIGN(SizeofHeapTupleHeader))); + continue; + } + if (ctx.lp_off + ctx.lp_len > BLCKSZ) + { + report_corruption(&ctx, + psprintf("line pointer to page offset %u with length %u ends beyond maximum page offset %u", + ctx.lp_off, + ctx.lp_len, + (unsigned) BLCKSZ)); + continue; + } + + /* It should be safe to examine the tuple's header, at least */ + lp_valid[ctx.offnum] = true; + ctx.tuphdr = (HeapTupleHeader) PageGetItem(ctx.page, ctx.itemid); + ctx.natts = HeapTupleHeaderGetNatts(ctx.tuphdr); + + /* Ok, ready to check this next tuple */ + check_tuple(&ctx, + &xmin_commit_status_ok[ctx.offnum], + &xmin_commit_status[ctx.offnum]); + + /* + * If the CTID field of this tuple seems to point to another tuple + * on the same page, record that tuple as the successor of this + * one. + */ + nextblkno = ItemPointerGetBlockNumber(&(ctx.tuphdr)->t_ctid); + nextoffnum = ItemPointerGetOffsetNumber(&(ctx.tuphdr)->t_ctid); + if (nextblkno == ctx.blkno && nextoffnum != ctx.offnum && + nextoffnum >= FirstOffsetNumber && nextoffnum <= maxoff) + successor[ctx.offnum] = nextoffnum; + } + + /* + * Update chain validation. Check each line pointer that's got a valid + * successor against that successor. + */ + ctx.attnum = -1; + for (ctx.offnum = FirstOffsetNumber; ctx.offnum <= maxoff; + ctx.offnum = OffsetNumberNext(ctx.offnum)) + { + ItemId curr_lp; + ItemId next_lp; + HeapTupleHeader curr_htup; + HeapTupleHeader next_htup; + TransactionId curr_xmin; + TransactionId curr_xmax; + TransactionId next_xmin; + OffsetNumber nextoffnum = successor[ctx.offnum]; + + /* + * The current line pointer may not have a successor, either + * because it's not valid or because it didn't point to anything. + * In either case, we have to give up. + * + * If the current line pointer does point to something, it's + * possible that the target line pointer isn't valid. We have to + * give up in that case, too. + */ + if (nextoffnum == InvalidOffsetNumber || !lp_valid[nextoffnum]) + continue; + + /* We have two valid line pointers that we can examine. */ + curr_lp = PageGetItemId(ctx.page, ctx.offnum); + next_lp = PageGetItemId(ctx.page, nextoffnum); + + /* Handle the cases where the current line pointer is a redirect. */ + if (ItemIdIsRedirected(curr_lp)) + { + /* + * We should not have set successor[ctx.offnum] to a value + * other than InvalidOffsetNumber unless that line pointer is + * LP_NORMAL. + */ + Assert(ItemIdIsNormal(next_lp)); + + /* Can only redirect to a HOT tuple. */ + next_htup = (HeapTupleHeader) PageGetItem(ctx.page, next_lp); + if (!HeapTupleHeaderIsHeapOnly(next_htup)) + { + report_corruption(&ctx, + psprintf("redirected line pointer points to a non-heap-only tuple at offset %u", + (unsigned) nextoffnum)); + } + + /* HOT chains should not intersect. */ + if (predecessor[nextoffnum] != InvalidOffsetNumber) + { + report_corruption(&ctx, + psprintf("redirect line pointer points to offset %u, but offset %u also points there", + (unsigned) nextoffnum, (unsigned) predecessor[nextoffnum])); + continue; + } + + /* + * This redirect and the tuple to which it points seem to be + * part of an update chain. + */ + predecessor[nextoffnum] = ctx.offnum; + continue; + } + + /* + * If the next line pointer is a redirect, or if it's a tuple but + * the XMAX of this tuple doesn't match the XMIN of the next + * tuple, then the two aren't part of the same update chain and + * there is nothing more to do. + */ + if (ItemIdIsRedirected(next_lp)) + continue; + curr_htup = (HeapTupleHeader) PageGetItem(ctx.page, curr_lp); + curr_xmax = HeapTupleHeaderGetUpdateXid(curr_htup); + next_htup = (HeapTupleHeader) PageGetItem(ctx.page, next_lp); + next_xmin = HeapTupleHeaderGetXmin(next_htup); + if (!TransactionIdIsValid(curr_xmax) || + !TransactionIdEquals(curr_xmax, next_xmin)) + continue; + + /* HOT chains should not intersect. */ + if (predecessor[nextoffnum] != InvalidOffsetNumber) + { + report_corruption(&ctx, + psprintf("tuple points to new version at offset %u, but offset %u also points there", + (unsigned) nextoffnum, (unsigned) predecessor[nextoffnum])); + continue; + } + + /* + * This tuple and the tuple to which it points seem to be part of + * an update chain. + */ + predecessor[nextoffnum] = ctx.offnum; + + /* + * If the current tuple is marked as HOT-updated, then the next + * tuple should be marked as a heap-only tuple. Conversely, if the + * current tuple isn't marked as HOT-updated, then the next tuple + * shouldn't be marked as a heap-only tuple. + * + * NB: Can't use HeapTupleHeaderIsHotUpdated() as it checks if + * hint bits indicate xmin/xmax aborted. + */ + if (!(curr_htup->t_infomask2 & HEAP_HOT_UPDATED) && + HeapTupleHeaderIsHeapOnly(next_htup)) + { + report_corruption(&ctx, + psprintf("non-heap-only update produced a heap-only tuple at offset %u", + (unsigned) nextoffnum)); + } + if ((curr_htup->t_infomask2 & HEAP_HOT_UPDATED) && + !HeapTupleHeaderIsHeapOnly(next_htup)) + { + report_corruption(&ctx, + psprintf("heap-only update produced a non-heap only tuple at offset %u", + (unsigned) nextoffnum)); + } + + /* + * If the current tuple's xmin is still in progress but the + * successor tuple's xmin is committed, that's corruption. + * + * NB: We recheck the commit status of the current tuple's xmin + * here, because it might have committed after we checked it and + * before we checked the commit status of the successor tuple's + * xmin. This should be safe because the xmin itself can't have + * changed, only its commit status. + */ + curr_xmin = HeapTupleHeaderGetXmin(curr_htup); + if (xmin_commit_status_ok[ctx.offnum] && + xmin_commit_status[ctx.offnum] == XID_IN_PROGRESS && + xmin_commit_status_ok[nextoffnum] && + xmin_commit_status[nextoffnum] == XID_COMMITTED && + TransactionIdIsInProgress(curr_xmin)) + { + report_corruption(&ctx, + psprintf("tuple with in-progress xmin %u was updated to produce a tuple at offset %u with committed xmin %u", + (unsigned) curr_xmin, + (unsigned) ctx.offnum, + (unsigned) next_xmin)); + } + + /* + * If the current tuple's xmin is aborted but the successor + * tuple's xmin is in-progress or committed, that's corruption. + */ + if (xmin_commit_status_ok[ctx.offnum] && + xmin_commit_status[ctx.offnum] == XID_ABORTED && + xmin_commit_status_ok[nextoffnum]) + { + if (xmin_commit_status[nextoffnum] == XID_IN_PROGRESS) + report_corruption(&ctx, + psprintf("tuple with aborted xmin %u was updated to produce a tuple at offset %u with in-progress xmin %u", + (unsigned) curr_xmin, + (unsigned) ctx.offnum, + (unsigned) next_xmin)); + else if (xmin_commit_status[nextoffnum] == XID_COMMITTED) + report_corruption(&ctx, + psprintf("tuple with aborted xmin %u was updated to produce a tuple at offset %u with committed xmin %u", + (unsigned) curr_xmin, + (unsigned) ctx.offnum, + (unsigned) next_xmin)); + } + } + + /* + * An update chain can start either with a non-heap-only tuple or with + * a redirect line pointer, but not with a heap-only tuple. + * + * (This check is in a separate loop because we need the predecessor + * array to be fully populated before we can perform it.) + */ + for (ctx.offnum = FirstOffsetNumber; + ctx.offnum <= maxoff; + ctx.offnum = OffsetNumberNext(ctx.offnum)) + { + if (xmin_commit_status_ok[ctx.offnum] && + (xmin_commit_status[ctx.offnum] == XID_COMMITTED || + xmin_commit_status[ctx.offnum] == XID_IN_PROGRESS) && + predecessor[ctx.offnum] == InvalidOffsetNumber) + { + ItemId curr_lp; + + curr_lp = PageGetItemId(ctx.page, ctx.offnum); + if (!ItemIdIsRedirected(curr_lp)) + { + HeapTupleHeader curr_htup; + + curr_htup = (HeapTupleHeader) + PageGetItem(ctx.page, curr_lp); + if (HeapTupleHeaderIsHeapOnly(curr_htup)) + report_corruption(&ctx, + psprintf("tuple is root of chain but is marked as heap-only tuple")); + } + } + } + + /* clean up */ + UnlockReleaseBuffer(ctx.buffer); + + /* + * Check any toast pointers from the page whose lock we just released + */ + if (ctx.toasted_attributes != NIL) + { + ListCell *cell; + + foreach(cell, ctx.toasted_attributes) + check_toasted_attribute(&ctx, lfirst(cell)); + list_free_deep(ctx.toasted_attributes); + ctx.toasted_attributes = NIL; + } + + if (on_error_stop && ctx.is_corrupt) + break; + } + + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + /* Close the associated toast table and indexes, if any. */ + if (ctx.toast_indexes) + toast_close_indexes(ctx.toast_indexes, ctx.num_toast_indexes, + AccessShareLock); + if (ctx.toast_rel) + table_close(ctx.toast_rel, AccessShareLock); + + /* Close the main relation */ + relation_close(ctx.rel, AccessShareLock); + + PG_RETURN_NULL(); +} + +/* + * Shared internal implementation for report_corruption and + * report_toast_corruption. + */ +static void +report_corruption_internal(Tuplestorestate *tupstore, TupleDesc tupdesc, + BlockNumber blkno, OffsetNumber offnum, + AttrNumber attnum, char *msg) +{ + Datum values[HEAPCHECK_RELATION_COLS] = {0}; + bool nulls[HEAPCHECK_RELATION_COLS] = {0}; + HeapTuple tuple; + + values[0] = Int64GetDatum(blkno); + values[1] = Int32GetDatum(offnum); + values[2] = Int32GetDatum(attnum); + nulls[2] = (attnum < 0); + values[3] = CStringGetTextDatum(msg); + + /* + * In principle, there is nothing to prevent a scan over a large, highly + * corrupted table from using work_mem worth of memory building up the + * tuplestore. That's ok, but if we also leak the msg argument memory + * until the end of the query, we could exceed work_mem by more than a + * trivial amount. Therefore, free the msg argument each time we are + * called rather than waiting for our current memory context to be freed. + */ + pfree(msg); + + tuple = heap_form_tuple(tupdesc, values, nulls); + tuplestore_puttuple(tupstore, tuple); +} + +/* + * Record a single corruption found in the main table. The values in ctx should + * indicate the location of the corruption, and the msg argument should contain + * a human-readable description of the corruption. + * + * The msg argument is pfree'd by this function. + */ +static void +report_corruption(HeapCheckContext *ctx, char *msg) +{ + report_corruption_internal(ctx->tupstore, ctx->tupdesc, ctx->blkno, + ctx->offnum, ctx->attnum, msg); + ctx->is_corrupt = true; +} + +/* + * Record corruption found in the toast table. The values in ta should + * indicate the location in the main table where the toast pointer was + * encountered, and the msg argument should contain a human-readable + * description of the toast table corruption. + * + * As above, the msg argument is pfree'd by this function. + */ +static void +report_toast_corruption(HeapCheckContext *ctx, ToastedAttribute *ta, + char *msg) +{ + report_corruption_internal(ctx->tupstore, ctx->tupdesc, ta->blkno, + ta->offnum, ta->attnum, msg); + ctx->is_corrupt = true; +} + +/* + * Check for tuple header corruption. + * + * Some kinds of corruption make it unsafe to check the tuple attributes, for + * example when the line pointer refers to a range of bytes outside the page. + * In such cases, we return false (not checkable) after recording appropriate + * corruption messages. + * + * Some other kinds of tuple header corruption confuse the question of where + * the tuple attributes begin, or how long the nulls bitmap is, etc., making it + * unreasonable to attempt to check attributes, even if all candidate answers + * to those questions would not result in reading past the end of the line + * pointer or page. In such cases, like above, we record corruption messages + * about the header and then return false. + * + * Other kinds of tuple header corruption do not bear on the question of + * whether the tuple attributes can be checked, so we record corruption + * messages for them but we do not return false merely because we detected + * them. + * + * Returns whether the tuple is sufficiently sensible to undergo visibility and + * attribute checks. + */ +static bool +check_tuple_header(HeapCheckContext *ctx) +{ + HeapTupleHeader tuphdr = ctx->tuphdr; + uint16 infomask = tuphdr->t_infomask; + TransactionId curr_xmax = HeapTupleHeaderGetUpdateXid(tuphdr); + bool result = true; + unsigned expected_hoff; + + if (ctx->tuphdr->t_hoff > ctx->lp_len) + { + report_corruption(ctx, + psprintf("data begins at offset %u beyond the tuple length %u", + ctx->tuphdr->t_hoff, ctx->lp_len)); + result = false; + } + + if ((ctx->tuphdr->t_infomask & HEAP_XMAX_COMMITTED) && + (ctx->tuphdr->t_infomask & HEAP_XMAX_IS_MULTI)) + { + report_corruption(ctx, + pstrdup("multixact should not be marked committed")); + + /* + * This condition is clearly wrong, but it's not enough to justify + * skipping further checks, because we don't rely on this to determine + * whether the tuple is visible or to interpret other relevant header + * fields. + */ + } + + if (!TransactionIdIsValid(curr_xmax) && + HeapTupleHeaderIsHotUpdated(tuphdr)) + { + report_corruption(ctx, + psprintf("tuple has been HOT updated, but xmax is 0")); + + /* + * As above, even though this shouldn't happen, it's not sufficient + * justification for skipping further checks, we should still be able + * to perform sensibly. + */ + } + + if (HeapTupleHeaderIsHeapOnly(tuphdr) && + ((tuphdr->t_infomask & HEAP_UPDATED) == 0)) + { + report_corruption(ctx, + psprintf("tuple is heap only, but not the result of an update")); + + /* Here again, we can still perform further checks. */ + } + + if (infomask & HEAP_HASNULL) + expected_hoff = MAXALIGN(SizeofHeapTupleHeader + BITMAPLEN(ctx->natts)); + else + expected_hoff = MAXALIGN(SizeofHeapTupleHeader); + if (ctx->tuphdr->t_hoff != expected_hoff) + { + if ((infomask & HEAP_HASNULL) && ctx->natts == 1) + report_corruption(ctx, + psprintf("tuple data should begin at byte %u, but actually begins at byte %u (1 attribute, has nulls)", + expected_hoff, ctx->tuphdr->t_hoff)); + else if ((infomask & HEAP_HASNULL)) + report_corruption(ctx, + psprintf("tuple data should begin at byte %u, but actually begins at byte %u (%u attributes, has nulls)", + expected_hoff, ctx->tuphdr->t_hoff, ctx->natts)); + else if (ctx->natts == 1) + report_corruption(ctx, + psprintf("tuple data should begin at byte %u, but actually begins at byte %u (1 attribute, no nulls)", + expected_hoff, ctx->tuphdr->t_hoff)); + else + report_corruption(ctx, + psprintf("tuple data should begin at byte %u, but actually begins at byte %u (%u attributes, no nulls)", + expected_hoff, ctx->tuphdr->t_hoff, ctx->natts)); + result = false; + } + + return result; +} + +/* + * Checks tuple visibility so we know which further checks are safe to + * perform. + * + * If a tuple could have been inserted by a transaction that also added a + * column to the table, but which ultimately did not commit, or which has not + * yet committed, then the table's current TupleDesc might differ from the one + * used to construct this tuple, so we must not check it. + * + * As a special case, if our own transaction inserted the tuple, even if we + * added a column to the table, our TupleDesc should match. We could check the + * tuple, but choose not to do so. + * + * If a tuple has been updated or deleted, we can still read the old tuple for + * corruption checking purposes, as long as we are careful about concurrent + * vacuums. The main table tuple itself cannot be vacuumed away because we + * hold a buffer lock on the page, but if the deleting transaction is older + * than our transaction snapshot's xmin, then vacuum could remove the toast at + * any time, so we must not try to follow TOAST pointers. + * + * If xmin or xmax values are older than can be checked against clog, or appear + * to be in the future (possibly due to wrap-around), then we cannot make a + * determination about the visibility of the tuple, so we skip further checks. + * + * Returns true if the tuple itself should be checked, false otherwise. Sets + * ctx->tuple_could_be_pruned if the tuple -- and thus also any associated + * TOAST tuples -- are eligible for pruning. + * + * Sets *xmin_commit_status_ok to true if the commit status of xmin is known + * and false otherwise. If it's set to true, then also set *xmin_commit_status + * to the actual commit status. + */ +static bool +check_tuple_visibility(HeapCheckContext *ctx, bool *xmin_commit_status_ok, + XidCommitStatus *xmin_commit_status) +{ + TransactionId xmin; + TransactionId xvac; + TransactionId xmax; + XidCommitStatus xmin_status; + XidCommitStatus xvac_status; + XidCommitStatus xmax_status; + HeapTupleHeader tuphdr = ctx->tuphdr; + + ctx->tuple_could_be_pruned = true; /* have not yet proven otherwise */ + *xmin_commit_status_ok = false; /* have not yet proven otherwise */ + + /* If xmin is normal, it should be within valid range */ + xmin = HeapTupleHeaderGetXmin(tuphdr); + switch (get_xid_status(xmin, ctx, &xmin_status)) + { + case XID_INVALID: + /* Could be the result of a speculative insertion that aborted. */ + return false; + case XID_BOUNDS_OK: + *xmin_commit_status_ok = true; + *xmin_commit_status = xmin_status; + break; + case XID_IN_FUTURE: + report_corruption(ctx, + psprintf("xmin %u equals or exceeds next valid transaction ID %u:%u", + xmin, + EpochFromFullTransactionId(ctx->next_fxid), + XidFromFullTransactionId(ctx->next_fxid))); + return false; + case XID_PRECEDES_CLUSTERMIN: + report_corruption(ctx, + psprintf("xmin %u precedes oldest valid transaction ID %u:%u", + xmin, + EpochFromFullTransactionId(ctx->oldest_fxid), + XidFromFullTransactionId(ctx->oldest_fxid))); + return false; + case XID_PRECEDES_RELMIN: + report_corruption(ctx, + psprintf("xmin %u precedes relation freeze threshold %u:%u", + xmin, + EpochFromFullTransactionId(ctx->relfrozenfxid), + XidFromFullTransactionId(ctx->relfrozenfxid))); + return false; + } + + /* + * Has inserting transaction committed? + */ + if (!HeapTupleHeaderXminCommitted(tuphdr)) + { + if (HeapTupleHeaderXminInvalid(tuphdr)) + return false; /* inserter aborted, don't check */ + /* Used by pre-9.0 binary upgrades */ + else if (tuphdr->t_infomask & HEAP_MOVED_OFF) + { + xvac = HeapTupleHeaderGetXvac(tuphdr); + + switch (get_xid_status(xvac, ctx, &xvac_status)) + { + case XID_INVALID: + report_corruption(ctx, + pstrdup("old-style VACUUM FULL transaction ID for moved off tuple is invalid")); + return false; + case XID_IN_FUTURE: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple equals or exceeds next valid transaction ID %u:%u", + xvac, + EpochFromFullTransactionId(ctx->next_fxid), + XidFromFullTransactionId(ctx->next_fxid))); + return false; + case XID_PRECEDES_RELMIN: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple precedes relation freeze threshold %u:%u", + xvac, + EpochFromFullTransactionId(ctx->relfrozenfxid), + XidFromFullTransactionId(ctx->relfrozenfxid))); + return false; + case XID_PRECEDES_CLUSTERMIN: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple precedes oldest valid transaction ID %u:%u", + xvac, + EpochFromFullTransactionId(ctx->oldest_fxid), + XidFromFullTransactionId(ctx->oldest_fxid))); + return false; + case XID_BOUNDS_OK: + break; + } + + switch (xvac_status) + { + case XID_IS_CURRENT_XID: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple matches our current transaction ID", + xvac)); + return false; + case XID_IN_PROGRESS: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple appears to be in progress", + xvac)); + return false; + + case XID_COMMITTED: + + /* + * The tuple is dead, because the xvac transaction moved + * it off and committed. It's checkable, but also + * prunable. + */ + return true; + + case XID_ABORTED: + + /* + * The original xmin must have committed, because the xvac + * transaction tried to move it later. Since xvac is + * aborted, whether it's still alive now depends on the + * status of xmax. + */ + break; + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuphdr->t_infomask & HEAP_MOVED_IN) + { + xvac = HeapTupleHeaderGetXvac(tuphdr); + + switch (get_xid_status(xvac, ctx, &xvac_status)) + { + case XID_INVALID: + report_corruption(ctx, + pstrdup("old-style VACUUM FULL transaction ID for moved in tuple is invalid")); + return false; + case XID_IN_FUTURE: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple equals or exceeds next valid transaction ID %u:%u", + xvac, + EpochFromFullTransactionId(ctx->next_fxid), + XidFromFullTransactionId(ctx->next_fxid))); + return false; + case XID_PRECEDES_RELMIN: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple precedes relation freeze threshold %u:%u", + xvac, + EpochFromFullTransactionId(ctx->relfrozenfxid), + XidFromFullTransactionId(ctx->relfrozenfxid))); + return false; + case XID_PRECEDES_CLUSTERMIN: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple precedes oldest valid transaction ID %u:%u", + xvac, + EpochFromFullTransactionId(ctx->oldest_fxid), + XidFromFullTransactionId(ctx->oldest_fxid))); + return false; + case XID_BOUNDS_OK: + break; + } + + switch (xvac_status) + { + case XID_IS_CURRENT_XID: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple matches our current transaction ID", + xvac)); + return false; + case XID_IN_PROGRESS: + report_corruption(ctx, + psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple appears to be in progress", + xvac)); + return false; + + case XID_COMMITTED: + + /* + * The original xmin must have committed, because the xvac + * transaction moved it later. Whether it's still alive + * now depends on the status of xmax. + */ + break; + + case XID_ABORTED: + + /* + * The tuple is dead, because the xvac transaction moved + * it off and committed. It's checkable, but also + * prunable. + */ + return true; + } + } + else if (xmin_status != XID_COMMITTED) + { + /* + * Inserting transaction is not in progress, and not committed, so + * it might have changed the TupleDesc in ways we don't know + * about. Thus, don't try to check the tuple structure. + * + * If xmin_status happens to be XID_IS_CURRENT_XID, then in theory + * any such DDL changes ought to be visible to us, so perhaps we + * could check anyway in that case. But, for now, let's be + * conservative and treat this like any other uncommitted insert. + */ + return false; + } + } + + /* + * Okay, the inserter committed, so it was good at some point. Now what + * about the deleting transaction? + */ + + if (tuphdr->t_infomask & HEAP_XMAX_IS_MULTI) + { + /* + * xmax is a multixact, so sanity-check the MXID. Note that we do this + * prior to checking for HEAP_XMAX_INVALID or + * HEAP_XMAX_IS_LOCKED_ONLY. This might therefore complain about + * things that wouldn't actually be a problem during a normal scan, + * but eventually we're going to have to freeze, and that process will + * ignore hint bits. + * + * Even if the MXID is out of range, we still know that the original + * insert committed, so we can check the tuple itself. However, we + * can't rule out the possibility that this tuple is dead, so don't + * clear ctx->tuple_could_be_pruned. Possibly we should go ahead and + * clear that flag anyway if HEAP_XMAX_INVALID is set or if + * HEAP_XMAX_IS_LOCKED_ONLY is true, but for now we err on the side of + * avoiding possibly-bogus complaints about missing TOAST entries. + */ + xmax = HeapTupleHeaderGetRawXmax(tuphdr); + switch (check_mxid_valid_in_rel(xmax, ctx)) + { + case XID_INVALID: + report_corruption(ctx, + pstrdup("multitransaction ID is invalid")); + return true; + case XID_PRECEDES_RELMIN: + report_corruption(ctx, + psprintf("multitransaction ID %u precedes relation minimum multitransaction ID threshold %u", + xmax, ctx->relminmxid)); + return true; + case XID_PRECEDES_CLUSTERMIN: + report_corruption(ctx, + psprintf("multitransaction ID %u precedes oldest valid multitransaction ID threshold %u", + xmax, ctx->oldest_mxact)); + return true; + case XID_IN_FUTURE: + report_corruption(ctx, + psprintf("multitransaction ID %u equals or exceeds next valid multitransaction ID %u", + xmax, + ctx->next_mxact)); + return true; + case XID_BOUNDS_OK: + break; + } + } + + if (tuphdr->t_infomask & HEAP_XMAX_INVALID) + { + /* + * This tuple is live. A concurrently running transaction could + * delete it before we get around to checking the toast, but any such + * running transaction is surely not less than our safe_xmin, so the + * toast cannot be vacuumed out from under us. + */ + ctx->tuple_could_be_pruned = false; + return true; + } + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuphdr->t_infomask)) + { + /* + * "Deleting" xact really only locked it, so the tuple is live in any + * case. As above, a concurrently running transaction could delete + * it, but it cannot be vacuumed out from under us. + */ + ctx->tuple_could_be_pruned = false; + return true; + } + + if (tuphdr->t_infomask & HEAP_XMAX_IS_MULTI) + { + /* + * We already checked above that this multixact is within limits for + * this table. Now check the update xid from this multixact. + */ + xmax = HeapTupleGetUpdateXid(tuphdr); + switch (get_xid_status(xmax, ctx, &xmax_status)) + { + case XID_INVALID: + /* not LOCKED_ONLY, so it has to have an xmax */ + report_corruption(ctx, + pstrdup("update xid is invalid")); + return true; + case XID_IN_FUTURE: + report_corruption(ctx, + psprintf("update xid %u equals or exceeds next valid transaction ID %u:%u", + xmax, + EpochFromFullTransactionId(ctx->next_fxid), + XidFromFullTransactionId(ctx->next_fxid))); + return true; + case XID_PRECEDES_RELMIN: + report_corruption(ctx, + psprintf("update xid %u precedes relation freeze threshold %u:%u", + xmax, + EpochFromFullTransactionId(ctx->relfrozenfxid), + XidFromFullTransactionId(ctx->relfrozenfxid))); + return true; + case XID_PRECEDES_CLUSTERMIN: + report_corruption(ctx, + psprintf("update xid %u precedes oldest valid transaction ID %u:%u", + xmax, + EpochFromFullTransactionId(ctx->oldest_fxid), + XidFromFullTransactionId(ctx->oldest_fxid))); + return true; + case XID_BOUNDS_OK: + break; + } + + switch (xmax_status) + { + case XID_IS_CURRENT_XID: + case XID_IN_PROGRESS: + + /* + * The delete is in progress, so it cannot be visible to our + * snapshot. + */ + ctx->tuple_could_be_pruned = false; + break; + case XID_COMMITTED: + + /* + * The delete committed. Whether the toast can be vacuumed + * away depends on how old the deleting transaction is. + */ + ctx->tuple_could_be_pruned = TransactionIdPrecedes(xmax, + ctx->safe_xmin); + break; + case XID_ABORTED: + + /* + * The delete aborted or crashed. The tuple is still live. + */ + ctx->tuple_could_be_pruned = false; + break; + } + + /* Tuple itself is checkable even if it's dead. */ + return true; + } + + /* xmax is an XID, not a MXID. Sanity check it. */ + xmax = HeapTupleHeaderGetRawXmax(tuphdr); + switch (get_xid_status(xmax, ctx, &xmax_status)) + { + case XID_INVALID: + ctx->tuple_could_be_pruned = false; + return true; + case XID_IN_FUTURE: + report_corruption(ctx, + psprintf("xmax %u equals or exceeds next valid transaction ID %u:%u", + xmax, + EpochFromFullTransactionId(ctx->next_fxid), + XidFromFullTransactionId(ctx->next_fxid))); + return false; /* corrupt */ + case XID_PRECEDES_RELMIN: + report_corruption(ctx, + psprintf("xmax %u precedes relation freeze threshold %u:%u", + xmax, + EpochFromFullTransactionId(ctx->relfrozenfxid), + XidFromFullTransactionId(ctx->relfrozenfxid))); + return false; /* corrupt */ + case XID_PRECEDES_CLUSTERMIN: + report_corruption(ctx, + psprintf("xmax %u precedes oldest valid transaction ID %u:%u", + xmax, + EpochFromFullTransactionId(ctx->oldest_fxid), + XidFromFullTransactionId(ctx->oldest_fxid))); + return false; /* corrupt */ + case XID_BOUNDS_OK: + break; + } + + /* + * Whether the toast can be vacuumed away depends on how old the deleting + * transaction is. + */ + switch (xmax_status) + { + case XID_IS_CURRENT_XID: + case XID_IN_PROGRESS: + + /* + * The delete is in progress, so it cannot be visible to our + * snapshot. + */ + ctx->tuple_could_be_pruned = false; + break; + + case XID_COMMITTED: + + /* + * The delete committed. Whether the toast can be vacuumed away + * depends on how old the deleting transaction is. + */ + ctx->tuple_could_be_pruned = TransactionIdPrecedes(xmax, + ctx->safe_xmin); + break; + + case XID_ABORTED: + + /* + * The delete aborted or crashed. The tuple is still live. + */ + ctx->tuple_could_be_pruned = false; + break; + } + + /* Tuple itself is checkable even if it's dead. */ + return true; +} + + +/* + * Check the current toast tuple against the state tracked in ctx, recording + * any corruption found in ctx->tupstore. + * + * This is not equivalent to running verify_heapam on the toast table itself, + * and is not hardened against corruption of the toast table. Rather, when + * validating a toasted attribute in the main table, the sequence of toast + * tuples that store the toasted value are retrieved and checked in order, with + * each toast tuple being checked against where we are in the sequence, as well + * as each toast tuple having its varlena structure sanity checked. + * + * On entry, *expected_chunk_seq should be the chunk_seq value that we expect + * to find in toasttup. On exit, it will be updated to the value the next call + * to this function should expect to see. + */ +static void +check_toast_tuple(HeapTuple toasttup, HeapCheckContext *ctx, + ToastedAttribute *ta, int32 *expected_chunk_seq, + uint32 extsize) +{ + int32 chunk_seq; + int32 last_chunk_seq = (extsize - 1) / TOAST_MAX_CHUNK_SIZE; + Pointer chunk; + bool isnull; + int32 chunksize; + int32 expected_size; + + /* Sanity-check the sequence number. */ + chunk_seq = DatumGetInt32(fastgetattr(toasttup, 2, + ctx->toast_rel->rd_att, &isnull)); + if (isnull) + { + report_toast_corruption(ctx, ta, + psprintf("toast value %u has toast chunk with null sequence number", + ta->toast_pointer.va_valueid)); + return; + } + if (chunk_seq != *expected_chunk_seq) + { + /* Either the TOAST index is corrupt, or we don't have all chunks. */ + report_toast_corruption(ctx, ta, + psprintf("toast value %u index scan returned chunk %d when expecting chunk %d", + ta->toast_pointer.va_valueid, + chunk_seq, *expected_chunk_seq)); + } + *expected_chunk_seq = chunk_seq + 1; + + /* Sanity-check the chunk data. */ + chunk = DatumGetPointer(fastgetattr(toasttup, 3, + ctx->toast_rel->rd_att, &isnull)); + if (isnull) + { + report_toast_corruption(ctx, ta, + psprintf("toast value %u chunk %d has null data", + ta->toast_pointer.va_valueid, + chunk_seq)); + return; + } + if (!VARATT_IS_EXTENDED(chunk)) + chunksize = VARSIZE(chunk) - VARHDRSZ; + else if (VARATT_IS_SHORT(chunk)) + { + /* + * could happen due to heap_form_tuple doing its thing + */ + chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; + } + else + { + /* should never happen */ + uint32 header = ((varattrib_4b *) chunk)->va_4byte.va_header; + + report_toast_corruption(ctx, ta, + psprintf("toast value %u chunk %d has invalid varlena header %0x", + ta->toast_pointer.va_valueid, + chunk_seq, header)); + return; + } + + /* + * Some checks on the data we've found + */ + if (chunk_seq > last_chunk_seq) + { + report_toast_corruption(ctx, ta, + psprintf("toast value %u chunk %d follows last expected chunk %d", + ta->toast_pointer.va_valueid, + chunk_seq, last_chunk_seq)); + return; + } + + expected_size = chunk_seq < last_chunk_seq ? TOAST_MAX_CHUNK_SIZE + : extsize - (last_chunk_seq * TOAST_MAX_CHUNK_SIZE); + + if (chunksize != expected_size) + report_toast_corruption(ctx, ta, + psprintf("toast value %u chunk %d has size %u, but expected size %u", + ta->toast_pointer.va_valueid, + chunk_seq, chunksize, expected_size)); +} + +/* + * Check the current attribute as tracked in ctx, recording any corruption + * found in ctx->tupstore. + * + * This function follows the logic performed by heap_deform_tuple(), and in the + * case of a toasted value, optionally stores the toast pointer so later it can + * be checked following the logic of detoast_external_attr(), checking for any + * conditions that would result in either of those functions Asserting or + * crashing the backend. The checks performed by Asserts present in those two + * functions are also performed here and in check_toasted_attribute. In cases + * where those two functions are a bit cavalier in their assumptions about data + * being correct, we perform additional checks not present in either of those + * two functions. Where some condition is checked in both of those functions, + * we perform it here twice, as we parallel the logical flow of those two + * functions. The presence of duplicate checks seems a reasonable price to pay + * for keeping this code tightly coupled with the code it protects. + * + * Returns true if the tuple attribute is sane enough for processing to + * continue on to the next attribute, false otherwise. + */ +static bool +check_tuple_attribute(HeapCheckContext *ctx) +{ + Datum attdatum; + struct varlena *attr; + char *tp; /* pointer to the tuple data */ + uint16 infomask; + Form_pg_attribute thisatt; + struct varatt_external toast_pointer; + + infomask = ctx->tuphdr->t_infomask; + thisatt = TupleDescAttr(RelationGetDescr(ctx->rel), ctx->attnum); + + tp = (char *) ctx->tuphdr + ctx->tuphdr->t_hoff; + + if (ctx->tuphdr->t_hoff + ctx->offset > ctx->lp_len) + { + report_corruption(ctx, + psprintf("attribute with length %u starts at offset %u beyond total tuple length %u", + thisatt->attlen, + ctx->tuphdr->t_hoff + ctx->offset, + ctx->lp_len)); + return false; + } + + /* Skip null values */ + if (infomask & HEAP_HASNULL && att_isnull(ctx->attnum, ctx->tuphdr->t_bits)) + return true; + + /* Skip non-varlena values, but update offset first */ + if (thisatt->attlen != -1) + { + ctx->offset = att_align_nominal(ctx->offset, thisatt->attalign); + ctx->offset = att_addlength_pointer(ctx->offset, thisatt->attlen, + tp + ctx->offset); + if (ctx->tuphdr->t_hoff + ctx->offset > ctx->lp_len) + { + report_corruption(ctx, + psprintf("attribute with length %u ends at offset %u beyond total tuple length %u", + thisatt->attlen, + ctx->tuphdr->t_hoff + ctx->offset, + ctx->lp_len)); + return false; + } + return true; + } + + /* Ok, we're looking at a varlena attribute. */ + ctx->offset = att_align_pointer(ctx->offset, thisatt->attalign, -1, + tp + ctx->offset); + + /* Get the (possibly corrupt) varlena datum */ + attdatum = fetchatt(thisatt, tp + ctx->offset); + + /* + * We have the datum, but we cannot decode it carelessly, as it may still + * be corrupt. + */ + + /* + * Check that VARTAG_SIZE won't hit an Assert on a corrupt va_tag before + * risking a call into att_addlength_pointer + */ + if (VARATT_IS_EXTERNAL(tp + ctx->offset)) + { + uint8 va_tag = VARTAG_EXTERNAL(tp + ctx->offset); + + if (va_tag != VARTAG_ONDISK) + { + report_corruption(ctx, + psprintf("toasted attribute has unexpected TOAST tag %u", + va_tag)); + /* We can't know where the next attribute begins */ + return false; + } + } + + /* Ok, should be safe now */ + ctx->offset = att_addlength_pointer(ctx->offset, thisatt->attlen, + tp + ctx->offset); + + if (ctx->tuphdr->t_hoff + ctx->offset > ctx->lp_len) + { + report_corruption(ctx, + psprintf("attribute with length %u ends at offset %u beyond total tuple length %u", + thisatt->attlen, + ctx->tuphdr->t_hoff + ctx->offset, + ctx->lp_len)); + + return false; + } + + /* + * heap_deform_tuple would be done with this attribute at this point, + * having stored it in values[], and would continue to the next attribute. + * We go further, because we need to check if the toast datum is corrupt. + */ + + attr = (struct varlena *) DatumGetPointer(attdatum); + + /* + * Now we follow the logic of detoast_external_attr(), with the same + * caveats about being paranoid about corruption. + */ + + /* Skip values that are not external */ + if (!VARATT_IS_EXTERNAL(attr)) + return true; + + /* It is external, and we're looking at a page on disk */ + + /* + * Must copy attr into toast_pointer for alignment considerations + */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* Toasted attributes too large to be untoasted should never be stored */ + if (toast_pointer.va_rawsize > VARLENA_SIZE_LIMIT) + report_corruption(ctx, + psprintf("toast value %u rawsize %d exceeds limit %d", + toast_pointer.va_valueid, + toast_pointer.va_rawsize, + VARLENA_SIZE_LIMIT)); + + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + { + ToastCompressionId cmid; + bool valid = false; + + /* Compressed attributes should have a valid compression method */ + cmid = TOAST_COMPRESS_METHOD(&toast_pointer); + switch (cmid) + { + /* List of all valid compression method IDs */ + case TOAST_PGLZ_COMPRESSION_ID: + case TOAST_LZ4_COMPRESSION_ID: + valid = true; + break; + + /* Recognized but invalid compression method ID */ + case TOAST_INVALID_COMPRESSION_ID: + break; + + /* Intentionally no default here */ + } + if (!valid) + report_corruption(ctx, + psprintf("toast value %u has invalid compression method id %d", + toast_pointer.va_valueid, cmid)); + } + + /* The tuple header better claim to contain toasted values */ + if (!(infomask & HEAP_HASEXTERNAL)) + { + report_corruption(ctx, + psprintf("toast value %u is external but tuple header flag HEAP_HASEXTERNAL not set", + toast_pointer.va_valueid)); + return true; + } + + /* The relation better have a toast table */ + if (!ctx->rel->rd_rel->reltoastrelid) + { + report_corruption(ctx, + psprintf("toast value %u is external but relation has no toast relation", + toast_pointer.va_valueid)); + return true; + } + + /* If we were told to skip toast checking, then we're done. */ + if (ctx->toast_rel == NULL) + return true; + + /* + * If this tuple is eligible to be pruned, we cannot check the toast. + * Otherwise, we push a copy of the toast tuple so we can check it after + * releasing the main table buffer lock. + */ + if (!ctx->tuple_could_be_pruned) + { + ToastedAttribute *ta; + + ta = (ToastedAttribute *) palloc0(sizeof(ToastedAttribute)); + + VARATT_EXTERNAL_GET_POINTER(ta->toast_pointer, attr); + ta->blkno = ctx->blkno; + ta->offnum = ctx->offnum; + ta->attnum = ctx->attnum; + ctx->toasted_attributes = lappend(ctx->toasted_attributes, ta); + } + + return true; +} + +/* + * For each attribute collected in ctx->toasted_attributes, look up the value + * in the toast table and perform checks on it. This function should only be + * called on toast pointers which cannot be vacuumed away during our + * processing. + */ +static void +check_toasted_attribute(HeapCheckContext *ctx, ToastedAttribute *ta) +{ + SnapshotData SnapshotToast; + ScanKeyData toastkey; + SysScanDesc toastscan; + bool found_toasttup; + HeapTuple toasttup; + uint32 extsize; + int32 expected_chunk_seq = 0; + int32 last_chunk_seq; + + extsize = VARATT_EXTERNAL_GET_EXTSIZE(ta->toast_pointer); + last_chunk_seq = (extsize - 1) / TOAST_MAX_CHUNK_SIZE; + + /* + * Setup a scan key to find chunks in toast table with matching va_valueid + */ + ScanKeyInit(&toastkey, + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(ta->toast_pointer.va_valueid)); + + /* + * Check if any chunks for this toasted object exist in the toast table, + * accessible via the index. + */ + init_toast_snapshot(&SnapshotToast); + toastscan = systable_beginscan_ordered(ctx->toast_rel, + ctx->valid_toast_index, + &SnapshotToast, 1, + &toastkey); + found_toasttup = false; + while ((toasttup = + systable_getnext_ordered(toastscan, + ForwardScanDirection)) != NULL) + { + found_toasttup = true; + check_toast_tuple(toasttup, ctx, ta, &expected_chunk_seq, extsize); + } + systable_endscan_ordered(toastscan); + + if (!found_toasttup) + report_toast_corruption(ctx, ta, + psprintf("toast value %u not found in toast table", + ta->toast_pointer.va_valueid)); + else if (expected_chunk_seq <= last_chunk_seq) + report_toast_corruption(ctx, ta, + psprintf("toast value %u was expected to end at chunk %d, but ended while expecting chunk %d", + ta->toast_pointer.va_valueid, + last_chunk_seq, expected_chunk_seq)); +} + +/* + * Check the current tuple as tracked in ctx, recording any corruption found in + * ctx->tupstore. + * + * We return some information about the status of xmin to aid in validating + * update chains. + */ +static void +check_tuple(HeapCheckContext *ctx, bool *xmin_commit_status_ok, + XidCommitStatus *xmin_commit_status) +{ + /* + * Check various forms of tuple header corruption, and if the header is + * too corrupt, do not continue with other checks. + */ + if (!check_tuple_header(ctx)) + return; + + /* + * Check tuple visibility. If the inserting transaction aborted, we + * cannot assume our relation description matches the tuple structure, and + * therefore cannot check it. + */ + if (!check_tuple_visibility(ctx, xmin_commit_status_ok, + xmin_commit_status)) + return; + + /* + * The tuple is visible, so it must be compatible with the current version + * of the relation descriptor. It might have fewer columns than are + * present in the relation descriptor, but it cannot have more. + */ + if (RelationGetDescr(ctx->rel)->natts < ctx->natts) + { + report_corruption(ctx, + psprintf("number of attributes %u exceeds maximum expected for table %u", + ctx->natts, + RelationGetDescr(ctx->rel)->natts)); + return; + } + + /* + * Check each attribute unless we hit corruption that confuses what to do + * next, at which point we abort further attribute checks for this tuple. + * Note that we don't abort for all types of corruption, only for those + * types where we don't know how to continue. We also don't abort the + * checking of toasted attributes collected from the tuple prior to + * aborting. Those will still be checked later along with other toasted + * attributes collected from the page. + */ + ctx->offset = 0; + for (ctx->attnum = 0; ctx->attnum < ctx->natts; ctx->attnum++) + if (!check_tuple_attribute(ctx)) + break; /* cannot continue */ + + /* revert attnum to -1 until we again examine individual attributes */ + ctx->attnum = -1; +} + +/* + * Convert a TransactionId into a FullTransactionId using our cached values of + * the valid transaction ID range. It is the caller's responsibility to have + * already updated the cached values, if necessary. + */ +static FullTransactionId +FullTransactionIdFromXidAndCtx(TransactionId xid, const HeapCheckContext *ctx) +{ + uint64 nextfxid_i; + int32 diff; + FullTransactionId fxid; + + Assert(TransactionIdIsNormal(ctx->next_xid)); + Assert(FullTransactionIdIsNormal(ctx->next_fxid)); + Assert(XidFromFullTransactionId(ctx->next_fxid) == ctx->next_xid); + + if (!TransactionIdIsNormal(xid)) + return FullTransactionIdFromEpochAndXid(0, xid); + + nextfxid_i = U64FromFullTransactionId(ctx->next_fxid); + + /* compute the 32bit modulo difference */ + diff = (int32) (ctx->next_xid - xid); + + /* + * In cases of corruption we might see a 32bit xid that is before epoch 0. + * We can't represent that as a 64bit xid, due to 64bit xids being + * unsigned integers, without the modulo arithmetic of 32bit xid. There's + * no really nice way to deal with that, but it works ok enough to use + * FirstNormalFullTransactionId in that case, as a freshly initdb'd + * cluster already has a newer horizon. + */ + if (diff > 0 && (nextfxid_i - FirstNormalTransactionId) < (int64) diff) + { + Assert(EpochFromFullTransactionId(ctx->next_fxid) == 0); + fxid = FirstNormalFullTransactionId; + } + else + fxid = FullTransactionIdFromU64(nextfxid_i - diff); + + Assert(FullTransactionIdIsNormal(fxid)); + return fxid; +} + +/* + * Update our cached range of valid transaction IDs. + */ +static void +update_cached_xid_range(HeapCheckContext *ctx) +{ + /* Make cached copies */ + LWLockAcquire(XidGenLock, LW_SHARED); + ctx->next_fxid = ShmemVariableCache->nextXid; + ctx->oldest_xid = ShmemVariableCache->oldestXid; + LWLockRelease(XidGenLock); + + /* And compute alternate versions of the same */ + ctx->next_xid = XidFromFullTransactionId(ctx->next_fxid); + ctx->oldest_fxid = FullTransactionIdFromXidAndCtx(ctx->oldest_xid, ctx); +} + +/* + * Update our cached range of valid multitransaction IDs. + */ +static void +update_cached_mxid_range(HeapCheckContext *ctx) +{ + ReadMultiXactIdRange(&ctx->oldest_mxact, &ctx->next_mxact); +} + +/* + * Return whether the given FullTransactionId is within our cached valid + * transaction ID range. + */ +static inline bool +fxid_in_cached_range(FullTransactionId fxid, const HeapCheckContext *ctx) +{ + return (FullTransactionIdPrecedesOrEquals(ctx->oldest_fxid, fxid) && + FullTransactionIdPrecedes(fxid, ctx->next_fxid)); +} + +/* + * Checks whether a multitransaction ID is in the cached valid range, returning + * the nature of the range violation, if any. + */ +static XidBoundsViolation +check_mxid_in_range(MultiXactId mxid, HeapCheckContext *ctx) +{ + if (!TransactionIdIsValid(mxid)) + return XID_INVALID; + if (MultiXactIdPrecedes(mxid, ctx->relminmxid)) + return XID_PRECEDES_RELMIN; + if (MultiXactIdPrecedes(mxid, ctx->oldest_mxact)) + return XID_PRECEDES_CLUSTERMIN; + if (MultiXactIdPrecedesOrEquals(ctx->next_mxact, mxid)) + return XID_IN_FUTURE; + return XID_BOUNDS_OK; +} + +/* + * Checks whether the given mxid is valid to appear in the heap being checked, + * returning the nature of the range violation, if any. + * + * This function attempts to return quickly by caching the known valid mxid + * range in ctx. Callers should already have performed the initial setup of + * the cache prior to the first call to this function. + */ +static XidBoundsViolation +check_mxid_valid_in_rel(MultiXactId mxid, HeapCheckContext *ctx) +{ + XidBoundsViolation result; + + result = check_mxid_in_range(mxid, ctx); + if (result == XID_BOUNDS_OK) + return XID_BOUNDS_OK; + + /* The range may have advanced. Recheck. */ + update_cached_mxid_range(ctx); + return check_mxid_in_range(mxid, ctx); +} + +/* + * Checks whether the given transaction ID is (or was recently) valid to appear + * in the heap being checked, or whether it is too old or too new to appear in + * the relation, returning information about the nature of the bounds violation. + * + * We cache the range of valid transaction IDs. If xid is in that range, we + * conclude that it is valid, even though concurrent changes to the table might + * invalidate it under certain corrupt conditions. (For example, if the table + * contains corrupt all-frozen bits, a concurrent vacuum might skip the page(s) + * containing the xid and then truncate clog and advance the relfrozenxid + * beyond xid.) Reporting the xid as valid under such conditions seems + * acceptable, since if we had checked it earlier in our scan it would have + * truly been valid at that time. + * + * If the status argument is not NULL, and if and only if the transaction ID + * appears to be valid in this relation, the status argument will be set with + * the commit status of the transaction ID. + */ +static XidBoundsViolation +get_xid_status(TransactionId xid, HeapCheckContext *ctx, + XidCommitStatus *status) +{ + FullTransactionId fxid; + FullTransactionId clog_horizon; + + /* Quick check for special xids */ + if (!TransactionIdIsValid(xid)) + return XID_INVALID; + else if (xid == BootstrapTransactionId || xid == FrozenTransactionId) + { + if (status != NULL) + *status = XID_COMMITTED; + return XID_BOUNDS_OK; + } + + /* Check if the xid is within bounds */ + fxid = FullTransactionIdFromXidAndCtx(xid, ctx); + if (!fxid_in_cached_range(fxid, ctx)) + { + /* + * We may have been checking against stale values. Update the cached + * range to be sure, and since we relied on the cached range when we + * performed the full xid conversion, reconvert. + */ + update_cached_xid_range(ctx); + fxid = FullTransactionIdFromXidAndCtx(xid, ctx); + } + + if (FullTransactionIdPrecedesOrEquals(ctx->next_fxid, fxid)) + return XID_IN_FUTURE; + if (FullTransactionIdPrecedes(fxid, ctx->oldest_fxid)) + return XID_PRECEDES_CLUSTERMIN; + if (FullTransactionIdPrecedes(fxid, ctx->relfrozenfxid)) + return XID_PRECEDES_RELMIN; + + /* Early return if the caller does not request clog checking */ + if (status == NULL) + return XID_BOUNDS_OK; + + /* Early return if we just checked this xid in a prior call */ + if (xid == ctx->cached_xid) + { + *status = ctx->cached_status; + return XID_BOUNDS_OK; + } + + *status = XID_COMMITTED; + LWLockAcquire(XactTruncationLock, LW_SHARED); + clog_horizon = + FullTransactionIdFromXidAndCtx(ShmemVariableCache->oldestClogXid, + ctx); + if (FullTransactionIdPrecedesOrEquals(clog_horizon, fxid)) + { + if (TransactionIdIsCurrentTransactionId(xid)) + *status = XID_IS_CURRENT_XID; + else if (TransactionIdIsInProgress(xid)) + *status = XID_IN_PROGRESS; + else if (TransactionIdDidCommit(xid)) + *status = XID_COMMITTED; + else + *status = XID_ABORTED; + } + LWLockRelease(XactTruncationLock); + ctx->cached_xid = xid; + ctx->cached_status = *status; + return XID_BOUNDS_OK; +} diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c new file mode 100644 index 0000000..dc7d4a5 --- /dev/null +++ b/contrib/amcheck/verify_nbtree.c @@ -0,0 +1,3347 @@ +/*------------------------------------------------------------------------- + * + * verify_nbtree.c + * Verifies the integrity of nbtree indexes based on invariants. + * + * For B-Tree indexes, verification includes checking that each page in the + * target index has items in logical order as reported by an insertion scankey + * (the insertion scankey sort-wise NULL semantics are needed for + * verification). + * + * When index-to-heap verification is requested, a Bloom filter is used to + * fingerprint all tuples in the target index, as the index is traversed to + * verify its structure. A heap scan later uses Bloom filter probes to verify + * that every visible heap tuple has a matching index tuple. + * + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/amcheck/verify_nbtree.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/table.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "catalog/pg_opfamily_d.h" +#include "commands/tablecmds.h" +#include "common/pg_prng.h" +#include "lib/bloomfilter.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + + +PG_MODULE_MAGIC; + +/* + * A B-Tree cannot possibly have this many levels, since there must be one + * block per level, which is bound by the range of BlockNumber: + */ +#define InvalidBtreeLevel ((uint32) InvalidBlockNumber) +#define BTreeTupleGetNKeyAtts(itup, rel) \ + Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel)) + +/* + * State associated with verifying a B-Tree index + * + * target is the point of reference for a verification operation. + * + * Other B-Tree pages may be allocated, but those are always auxiliary (e.g., + * they are current target's child pages). Conceptually, problems are only + * ever found in the current target page (or for a particular heap tuple during + * heapallindexed verification). Each page found by verification's left/right, + * top/bottom scan becomes the target exactly once. + */ +typedef struct BtreeCheckState +{ + /* + * Unchanging state, established at start of verification: + */ + + /* B-Tree Index Relation and associated heap relation */ + Relation rel; + Relation heaprel; + /* rel is heapkeyspace index? */ + bool heapkeyspace; + /* ShareLock held on heap/index, rather than AccessShareLock? */ + bool readonly; + /* Also verifying heap has no unindexed tuples? */ + bool heapallindexed; + /* Also making sure non-pivot tuples can be found by new search? */ + bool rootdescend; + /* Per-page context */ + MemoryContext targetcontext; + /* Buffer access strategy */ + BufferAccessStrategy checkstrategy; + + /* + * Mutable state, for verification of particular page: + */ + + /* Current target page */ + Page target; + /* Target block number */ + BlockNumber targetblock; + /* Target page's LSN */ + XLogRecPtr targetlsn; + + /* + * Low key: high key of left sibling of target page. Used only for child + * verification. So, 'lowkey' is kept only when 'readonly' is set. + */ + IndexTuple lowkey; + + /* + * The rightlink and incomplete split flag of block one level down to the + * target page, which was visited last time via downlink from target page. + * We use it to check for missing downlinks. + */ + BlockNumber prevrightlink; + bool previncompletesplit; + + /* + * Mutable state, for optional heapallindexed verification: + */ + + /* Bloom filter fingerprints B-Tree index */ + bloom_filter *filter; + /* Debug counter */ + int64 heaptuplespresent; +} BtreeCheckState; + +/* + * Starting point for verifying an entire B-Tree index level + */ +typedef struct BtreeLevel +{ + /* Level number (0 is leaf page level). */ + uint32 level; + + /* Left most block on level. Scan of level begins here. */ + BlockNumber leftmost; + + /* Is this level reported as "true" root level by meta page? */ + bool istruerootlevel; +} BtreeLevel; + +PG_FUNCTION_INFO_V1(bt_index_check); +PG_FUNCTION_INFO_V1(bt_index_parent_check); + +static void bt_index_check_internal(Oid indrelid, bool parentcheck, + bool heapallindexed, bool rootdescend); +static inline void btree_index_checkable(Relation rel); +static inline bool btree_index_mainfork_expected(Relation rel); +static void bt_check_every_level(Relation rel, Relation heaprel, + bool heapkeyspace, bool readonly, bool heapallindexed, + bool rootdescend); +static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, + BtreeLevel level); +static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state, + BlockNumber start, + BTPageOpaque start_opaque); +static void bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent); +static void bt_target_page_check(BtreeCheckState *state); +static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); +static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, + OffsetNumber downlinkoffnum); +static void bt_child_highkey_check(BtreeCheckState *state, + OffsetNumber target_downlinkoffnum, + Page loaded_child, + uint32 target_level); +static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber blkno, Page page); +static void bt_tuple_present_callback(Relation index, ItemPointer tid, + Datum *values, bool *isnull, + bool tupleIsAlive, void *checkstate); +static IndexTuple bt_normalize_tuple(BtreeCheckState *state, + IndexTuple itup); +static inline IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n); +static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); +static inline bool offset_is_negative_infinity(BTPageOpaque opaque, + OffsetNumber offset); +static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber upperbound); +static inline bool invariant_leq_offset(BtreeCheckState *state, + BTScanInsert key, + OffsetNumber upperbound); +static inline bool invariant_g_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber lowerbound); +static inline bool invariant_l_nontarget_offset(BtreeCheckState *state, + BTScanInsert key, + BlockNumber nontargetblock, + Page nontarget, + OffsetNumber upperbound); +static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum); +static inline BTScanInsert bt_mkscankey_pivotsearch(Relation rel, + IndexTuple itup); +static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, + Page page, OffsetNumber offset); +static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, + IndexTuple itup, bool nonpivot); +static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup); + +/* + * bt_index_check(index regclass, heapallindexed boolean) + * + * Verify integrity of B-Tree index. + * + * Acquires AccessShareLock on heap & index relations. Does not consider + * invariants that exist between parent/child pages. Optionally verifies + * that heap does not contain any unindexed or incorrectly indexed tuples. + */ +Datum +bt_index_check(PG_FUNCTION_ARGS) +{ + Oid indrelid = PG_GETARG_OID(0); + bool heapallindexed = false; + + if (PG_NARGS() == 2) + heapallindexed = PG_GETARG_BOOL(1); + + bt_index_check_internal(indrelid, false, heapallindexed, false); + + PG_RETURN_VOID(); +} + +/* + * bt_index_parent_check(index regclass, heapallindexed boolean) + * + * Verify integrity of B-Tree index. + * + * Acquires ShareLock on heap & index relations. Verifies that downlinks in + * parent pages are valid lower bounds on child pages. Optionally verifies + * that heap does not contain any unindexed or incorrectly indexed tuples. + */ +Datum +bt_index_parent_check(PG_FUNCTION_ARGS) +{ + Oid indrelid = PG_GETARG_OID(0); + bool heapallindexed = false; + bool rootdescend = false; + + if (PG_NARGS() >= 2) + heapallindexed = PG_GETARG_BOOL(1); + if (PG_NARGS() == 3) + rootdescend = PG_GETARG_BOOL(2); + + bt_index_check_internal(indrelid, true, heapallindexed, rootdescend); + + PG_RETURN_VOID(); +} + +/* + * Helper for bt_index_[parent_]check, coordinating the bulk of the work. + */ +static void +bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, + bool rootdescend) +{ + Oid heapid; + Relation indrel; + Relation heaprel; + LOCKMODE lockmode; + Oid save_userid; + int save_sec_context; + int save_nestlevel; + + if (parentcheck) + lockmode = ShareLock; + else + lockmode = AccessShareLock; + + /* + * We must lock table before index to avoid deadlocks. However, if the + * passed indrelid isn't an index then IndexGetRelation() will fail. + * Rather than emitting a not-very-helpful error message, postpone + * complaining, expecting that the is-it-an-index test below will fail. + * + * In hot standby mode this will raise an error when parentcheck is true. + */ + heapid = IndexGetRelation(indrelid, true); + if (OidIsValid(heapid)) + { + heaprel = table_open(heapid, lockmode); + + /* + * Switch to the table owner's userid, so that any index functions are + * run as that user. Also lock down security-restricted operations + * and arrange to make GUC variable changes local to this command. + */ + GetUserIdAndSecContext(&save_userid, &save_sec_context); + SetUserIdAndSecContext(heaprel->rd_rel->relowner, + save_sec_context | SECURITY_RESTRICTED_OPERATION); + save_nestlevel = NewGUCNestLevel(); + } + else + { + heaprel = NULL; + /* Set these just to suppress "uninitialized variable" warnings */ + save_userid = InvalidOid; + save_sec_context = -1; + save_nestlevel = -1; + } + + /* + * Open the target index relations separately (like relation_openrv(), but + * with heap relation locked first to prevent deadlocking). In hot + * standby mode this will raise an error when parentcheck is true. + * + * There is no need for the usual indcheckxmin usability horizon test + * here, even in the heapallindexed case, because index undergoing + * verification only needs to have entries for a new transaction snapshot. + * (If this is a parentcheck verification, there is no question about + * committed or recently dead heap tuples lacking index entries due to + * concurrent activity.) + */ + indrel = index_open(indrelid, lockmode); + + /* + * Since we did the IndexGetRelation call above without any lock, it's + * barely possible that a race against an index drop/recreation could have + * netted us the wrong table. + */ + if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("could not open parent table of index \"%s\"", + RelationGetRelationName(indrel)))); + + /* Relation suitable for checking as B-Tree? */ + btree_index_checkable(indrel); + + if (btree_index_mainfork_expected(indrel)) + { + bool heapkeyspace, + allequalimage; + + if (!smgrexists(RelationGetSmgr(indrel), MAIN_FORKNUM)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" lacks a main relation fork", + RelationGetRelationName(indrel)))); + + /* Extract metadata from metapage, and sanitize it in passing */ + _bt_metaversion(indrel, &heapkeyspace, &allequalimage); + if (allequalimage && !heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage has equalimage field set on unsupported nbtree version", + RelationGetRelationName(indrel)))); + if (allequalimage && !_bt_allequalimage(indrel, false)) + { + bool has_interval_ops = false; + + for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(indrel); i++) + if (indrel->rd_opfamily[i] == INTERVAL_BTREE_FAM_OID) + has_interval_ops = true; + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage incorrectly indicates that deduplication is safe", + RelationGetRelationName(indrel)), + has_interval_ops + ? errhint("This is known of \"interval\" indexes last built on a version predating 2023-11.") + : 0)); + } + + /* Check index, possibly against table it is an index on */ + bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, + heapallindexed, rootdescend); + } + + /* Roll back any GUC changes executed by index functions */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + /* + * Release locks early. That's ok here because nothing in the called + * routines will trigger shared cache invalidations to be sent, so we can + * relax the usual pattern of only releasing locks after commit. + */ + index_close(indrel, lockmode); + if (heaprel) + table_close(heaprel, lockmode); +} + +/* + * Basic checks about the suitability of a relation for checking as a B-Tree + * index. + * + * NB: Intentionally not checking permissions, the function is normally not + * callable by non-superusers. If granted, it's useful to be able to check a + * whole cluster. + */ +static inline void +btree_index_checkable(Relation rel) +{ + if (rel->rd_rel->relkind != RELKIND_INDEX || + rel->rd_rel->relam != BTREE_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only B-Tree indexes are supported as targets for verification"), + errdetail("Relation \"%s\" is not a B-Tree index.", + RelationGetRelationName(rel)))); + + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"), + errdetail("Index \"%s\" is associated with temporary relation.", + RelationGetRelationName(rel)))); + + if (!rel->rd_index->indisvalid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot check index \"%s\"", + RelationGetRelationName(rel)), + errdetail("Index is not valid."))); +} + +/* + * Check if B-Tree index relation should have a file for its main relation + * fork. Verification uses this to skip unlogged indexes when in hot standby + * mode, where there is simply nothing to verify. We behave as if the + * relation is empty. + * + * NB: Caller should call btree_index_checkable() before calling here. + */ +static inline bool +btree_index_mainfork_expected(Relation rel) +{ + if (rel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED || + !RecoveryInProgress()) + return true; + + ereport(DEBUG1, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot verify unlogged index \"%s\" during recovery, skipping", + RelationGetRelationName(rel)))); + + return false; +} + +/* + * Main entry point for B-Tree SQL-callable functions. Walks the B-Tree in + * logical order, verifying invariants as it goes. Optionally, verification + * checks if the heap relation contains any tuples that are not represented in + * the index but should be. + * + * It is the caller's responsibility to acquire appropriate heavyweight lock on + * the index relation, and advise us if extra checks are safe when a ShareLock + * is held. (A lock of the same type must also have been acquired on the heap + * relation.) + * + * A ShareLock is generally assumed to prevent any kind of physical + * modification to the index structure, including modifications that VACUUM may + * make. This does not include setting of the LP_DEAD bit by concurrent index + * scans, although that is just metadata that is not able to directly affect + * any check performed here. Any concurrent process that might act on the + * LP_DEAD bit being set (recycle space) requires a heavyweight lock that + * cannot be held while we hold a ShareLock. (Besides, even if that could + * happen, the ad-hoc recycling when a page might otherwise split is performed + * per-page, and requires an exclusive buffer lock, which wouldn't cause us + * trouble. _bt_delitems_vacuum() may only delete leaf items, and so the extra + * parent/child check cannot be affected.) + */ +static void +bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, + bool readonly, bool heapallindexed, bool rootdescend) +{ + BtreeCheckState *state; + Page metapage; + BTMetaPageData *metad; + uint32 previouslevel; + BtreeLevel current; + Snapshot snapshot = SnapshotAny; + + if (!readonly) + elog(DEBUG1, "verifying consistency of tree structure for index \"%s\"", + RelationGetRelationName(rel)); + else + elog(DEBUG1, "verifying consistency of tree structure for index \"%s\" with cross-level checks", + RelationGetRelationName(rel)); + + /* + * This assertion matches the one in index_getnext_tid(). See page + * recycling/"visible to everyone" notes in nbtree README. + */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * Initialize state for entire verification operation + */ + state = palloc0(sizeof(BtreeCheckState)); + state->rel = rel; + state->heaprel = heaprel; + state->heapkeyspace = heapkeyspace; + state->readonly = readonly; + state->heapallindexed = heapallindexed; + state->rootdescend = rootdescend; + + if (state->heapallindexed) + { + int64 total_pages; + int64 total_elems; + uint64 seed; + + /* + * Size Bloom filter based on estimated number of tuples in index, + * while conservatively assuming that each block must contain at least + * MaxTIDsPerBTreePage / 3 "plain" tuples -- see + * bt_posting_plain_tuple() for definition, and details of how posting + * list tuples are handled. + */ + total_pages = RelationGetNumberOfBlocks(rel); + total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3), + (int64) state->rel->rd_rel->reltuples); + /* Generate a random seed to avoid repetition */ + seed = pg_prng_uint64(&pg_global_prng_state); + /* Create Bloom filter to fingerprint index */ + state->filter = bloom_create(total_elems, maintenance_work_mem, seed); + state->heaptuplespresent = 0; + + /* + * Register our own snapshot in !readonly case, rather than asking + * table_index_build_scan() to do this for us later. This needs to + * happen before index fingerprinting begins, so we can later be + * certain that index fingerprinting should have reached all tuples + * returned by table_index_build_scan(). + */ + if (!state->readonly) + { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + + /* + * GetTransactionSnapshot() always acquires a new MVCC snapshot in + * READ COMMITTED mode. A new snapshot is guaranteed to have all + * the entries it requires in the index. + * + * We must defend against the possibility that an old xact + * snapshot was returned at higher isolation levels when that + * snapshot is not safe for index scans of the target index. This + * is possible when the snapshot sees tuples that are before the + * index's indcheckxmin horizon. Throwing an error here should be + * very rare. It doesn't seem worth using a secondary snapshot to + * avoid this. + */ + if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data), + snapshot->xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("index \"%s\" cannot be verified using transaction snapshot", + RelationGetRelationName(rel)))); + } + } + + Assert(!state->rootdescend || state->readonly); + if (state->rootdescend && !state->heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search", + RelationGetRelationName(rel)), + errhint("Only B-Tree version 4 indexes support rootdescend verification."))); + + /* Create context for page */ + state->targetcontext = AllocSetContextCreate(CurrentMemoryContext, + "amcheck context", + ALLOCSET_DEFAULT_SIZES); + state->checkstrategy = GetAccessStrategy(BAS_BULKREAD); + + /* Get true root block from meta-page */ + metapage = palloc_btree_page(state, BTREE_METAPAGE); + metad = BTPageGetMeta(metapage); + + /* + * Certain deletion patterns can result in "skinny" B-Tree indexes, where + * the fast root and true root differ. + * + * Start from the true root, not the fast root, unlike conventional index + * scans. This approach is more thorough, and removes the risk of + * following a stale fast root from the meta page. + */ + if (metad->btm_fastroot != metad->btm_root) + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("harmless fast root mismatch in index \"%s\"", + RelationGetRelationName(rel)), + errdetail_internal("Fast root block %u (level %u) differs from true root block %u (level %u).", + metad->btm_fastroot, metad->btm_fastlevel, + metad->btm_root, metad->btm_level))); + + /* + * Starting at the root, verify every level. Move left to right, top to + * bottom. Note that there may be no pages other than the meta page (meta + * page can indicate that root is P_NONE when the index is totally empty). + */ + previouslevel = InvalidBtreeLevel; + current.level = metad->btm_level; + current.leftmost = metad->btm_root; + current.istruerootlevel = true; + while (current.leftmost != P_NONE) + { + /* + * Verify this level, and get left most page for next level down, if + * not at leaf level + */ + current = bt_check_level_from_leftmost(state, current); + + if (current.leftmost == InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has no valid pages on level below %u or first level", + RelationGetRelationName(rel), previouslevel))); + + previouslevel = current.level; + } + + /* + * * Check whether heap contains unindexed/malformed tuples * + */ + if (state->heapallindexed) + { + IndexInfo *indexinfo = BuildIndexInfo(state->rel); + TableScanDesc scan; + + /* + * Create our own scan for table_index_build_scan(), rather than + * getting it to do so for us. This is required so that we can + * actually use the MVCC snapshot registered earlier in !readonly + * case. + * + * Note that table_index_build_scan() calls heap_endscan() for us. + */ + scan = table_beginscan_strat(state->heaprel, /* relation */ + snapshot, /* snapshot */ + 0, /* number of keys */ + NULL, /* scan key */ + true, /* buffer access strategy OK */ + true); /* syncscan OK? */ + + /* + * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY + * behaves in !readonly case. + * + * It's okay that we don't actually use the same lock strength for the + * heap relation as any other ii_Concurrent caller would in !readonly + * case. We have no reason to care about a concurrent VACUUM + * operation, since there isn't going to be a second scan of the heap + * that needs to be sure that there was no concurrent recycling of + * TIDs. + */ + indexinfo->ii_Concurrent = !state->readonly; + + /* + * Don't wait for uncommitted tuple xact commit/abort when index is a + * unique index on a catalog (or an index used by an exclusion + * constraint). This could otherwise happen in the readonly case. + */ + indexinfo->ii_Unique = false; + indexinfo->ii_ExclusionOps = NULL; + indexinfo->ii_ExclusionProcs = NULL; + indexinfo->ii_ExclusionStrats = NULL; + + elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"", + RelationGetRelationName(state->rel), + RelationGetRelationName(state->heaprel)); + + table_index_build_scan(state->heaprel, state->rel, indexinfo, true, false, + bt_tuple_present_callback, (void *) state, scan); + + ereport(DEBUG1, + (errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set", + state->heaptuplespresent, RelationGetRelationName(heaprel), + 100.0 * bloom_prop_bits_set(state->filter)))); + + if (snapshot != SnapshotAny) + UnregisterSnapshot(snapshot); + + bloom_free(state->filter); + } + + /* Be tidy: */ + MemoryContextDelete(state->targetcontext); +} + +/* + * Given a left-most block at some level, move right, verifying each page + * individually (with more verification across pages for "readonly" + * callers). Caller should pass the true root page as the leftmost initially, + * working their way down by passing what is returned for the last call here + * until level 0 (leaf page level) was reached. + * + * Returns state for next call, if any. This includes left-most block number + * one level lower that should be passed on next level/call, which is set to + * P_NONE on last call here (when leaf level is verified). Level numbers + * follow the nbtree convention: higher levels have higher numbers, because new + * levels are added only due to a root page split. Note that prior to the + * first root page split, the root is also a leaf page, so there is always a + * level 0 (leaf level), and it's always the last level processed. + * + * Note on memory management: State's per-page context is reset here, between + * each call to bt_target_page_check(). + */ +static BtreeLevel +bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) +{ + /* State to establish early, concerning entire level */ + BTPageOpaque opaque; + MemoryContext oldcontext; + BtreeLevel nextleveldown; + + /* Variables for iterating across level using right links */ + BlockNumber leftcurrent = P_NONE; + BlockNumber current = level.leftmost; + + /* Initialize return state */ + nextleveldown.leftmost = InvalidBlockNumber; + nextleveldown.level = InvalidBtreeLevel; + nextleveldown.istruerootlevel = false; + + /* Use page-level context for duration of this call */ + oldcontext = MemoryContextSwitchTo(state->targetcontext); + + elog(DEBUG1, "verifying level %u%s", level.level, + level.istruerootlevel ? + " (true root level)" : level.level == 0 ? " (leaf level)" : ""); + + state->prevrightlink = InvalidBlockNumber; + state->previncompletesplit = false; + + do + { + /* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */ + CHECK_FOR_INTERRUPTS(); + + /* Initialize state for this iteration */ + state->targetblock = current; + state->target = palloc_btree_page(state, state->targetblock); + state->targetlsn = PageGetLSN(state->target); + + opaque = BTPageGetOpaque(state->target); + + if (P_IGNORE(opaque)) + { + /* + * Since there cannot be a concurrent VACUUM operation in readonly + * mode, and since a page has no links within other pages + * (siblings and parent) once it is marked fully deleted, it + * should be impossible to land on a fully deleted page in + * readonly mode. See bt_child_check() for further details. + * + * The bt_child_check() P_ISDELETED() check is repeated here so + * that pages that are only reachable through sibling links get + * checked. + */ + if (state->readonly && P_ISDELETED(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("downlink or sibling link points to deleted block in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u left block=%u left link from block=%u.", + current, leftcurrent, opaque->btpo_prev))); + + if (P_RIGHTMOST(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u fell off the end of index \"%s\"", + current, RelationGetRelationName(state->rel)))); + else + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("block %u of index \"%s\" concurrently deleted", + current, RelationGetRelationName(state->rel)))); + goto nextpage; + } + else if (nextleveldown.leftmost == InvalidBlockNumber) + { + /* + * A concurrent page split could make the caller supplied leftmost + * block no longer contain the leftmost page, or no longer be the + * true root, but where that isn't possible due to heavyweight + * locking, check that the first valid page meets caller's + * expectations. + */ + if (state->readonly) + { + if (!bt_leftmost_ignoring_half_dead(state, current, opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u is not leftmost in index \"%s\"", + current, RelationGetRelationName(state->rel)))); + + if (level.istruerootlevel && !P_ISROOT(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u is not true root in index \"%s\"", + current, RelationGetRelationName(state->rel)))); + } + + /* + * Before beginning any non-trivial examination of level, prepare + * state for next bt_check_level_from_leftmost() invocation for + * the next level for the next level down (if any). + * + * There should be at least one non-ignorable page per level, + * unless this is the leaf level, which is assumed by caller to be + * final level. + */ + if (!P_ISLEAF(opaque)) + { + IndexTuple itup; + ItemId itemid; + + /* Internal page -- downlink gets leftmost on next level */ + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, + P_FIRSTDATAKEY(opaque)); + itup = (IndexTuple) PageGetItem(state->target, itemid); + nextleveldown.leftmost = BTreeTupleGetDownLink(itup); + nextleveldown.level = opaque->btpo_level - 1; + } + else + { + /* + * Leaf page -- final level caller must process. + * + * Note that this could also be the root page, if there has + * been no root page split yet. + */ + nextleveldown.leftmost = P_NONE; + nextleveldown.level = InvalidBtreeLevel; + } + + /* + * Finished setting up state for this call/level. Control will + * never end up back here in any future loop iteration for this + * level. + */ + } + + /* + * Sibling links should be in mutual agreement. There arises + * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling + * of the parent's low-key downlink is half-dead. (A half-dead page + * has no downlink from its parent.) Under heavyweight locking, the + * last bt_leftmost_ignoring_half_dead() validated this btpo_prev. + * Without heavyweight locking, validation of the P_NONE case remains + * unimplemented. + */ + if (opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE) + bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); + + /* Check level */ + if (level.level != opaque->btpo_level) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down", + RelationGetRelationName(state->rel)), + errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", + current, level.level, opaque->btpo_level))); + + /* Verify invariants for page */ + bt_target_page_check(state); + +nextpage: + + /* Try to detect circular links */ + if (current == leftcurrent || current == opaque->btpo_prev) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("circular link chain found in block %u of index \"%s\"", + current, RelationGetRelationName(state->rel)))); + + leftcurrent = current; + current = opaque->btpo_next; + + if (state->lowkey) + { + Assert(state->readonly); + pfree(state->lowkey); + state->lowkey = NULL; + } + + /* + * Copy current target high key as the low key of right sibling. + * Allocate memory in upper level context, so it would be cleared + * after reset of target context. + * + * We only need the low key in corner cases of checking child high + * keys. We use high key only when incomplete split on the child level + * falls to the boundary of pages on the target level. See + * bt_child_highkey_check() for details. So, typically we won't end + * up doing anything with low key, but it's simpler for general case + * high key verification to always have it available. + * + * The correctness of managing low key in the case of concurrent + * splits wasn't investigated yet. Thankfully we only need low key + * for readonly verification and concurrent splits won't happen. + */ + if (state->readonly && !P_RIGHTMOST(opaque)) + { + IndexTuple itup; + ItemId itemid; + + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, P_HIKEY); + itup = (IndexTuple) PageGetItem(state->target, itemid); + + state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup)); + memcpy(state->lowkey, itup, IndexTupleSize(itup)); + } + + /* Free page and associated memory for this iteration */ + MemoryContextReset(state->targetcontext); + } + while (current != P_NONE); + + if (state->lowkey) + { + Assert(state->readonly); + pfree(state->lowkey); + state->lowkey = NULL; + } + + /* Don't change context for caller */ + MemoryContextSwitchTo(oldcontext); + + return nextleveldown; +} + +/* + * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of + * half-dead, sibling-linked pages to the left. If a half-dead page appears + * under state->readonly, the database exited recovery between the first-stage + * and second-stage WAL records of a deletion. + */ +static bool +bt_leftmost_ignoring_half_dead(BtreeCheckState *state, + BlockNumber start, + BTPageOpaque start_opaque) +{ + BlockNumber reached = start_opaque->btpo_prev, + reached_from = start; + bool all_half_dead = true; + + /* + * To handle the !readonly case, we'd need to accept BTP_DELETED pages and + * potentially observe nbtree/README "Page deletion and backwards scans". + */ + Assert(state->readonly); + + while (reached != P_NONE && all_half_dead) + { + Page page = palloc_btree_page(state, reached); + BTPageOpaque reached_opaque = BTPageGetOpaque(page); + + CHECK_FOR_INTERRUPTS(); + + /* + * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page() + * writes that side-links will continue to point to the siblings. + * Check btpo_next for that property. + */ + all_half_dead = P_ISHALFDEAD(reached_opaque) && + reached != start && + reached != reached_from && + reached_opaque->btpo_next == reached_from; + if (all_half_dead) + { + XLogRecPtr pagelsn = PageGetLSN(page); + + /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */ + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("harmless interrupted page deletion detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u right block=%u page lsn=%X/%X.", + reached, reached_from, + LSN_FORMAT_ARGS(pagelsn)))); + + reached_from = reached; + reached = reached_opaque->btpo_prev; + } + + pfree(page); + } + + return all_half_dead; +} + +/* + * Raise an error when target page's left link does not point back to the + * previous target page, called leftcurrent here. The leftcurrent page's + * right link was followed to get to the current target page, and we expect + * mutual agreement among leftcurrent and the current target page. Make sure + * that this condition has definitely been violated in the !readonly case, + * where concurrent page splits are something that we need to deal with. + * + * Cross-page inconsistencies involving pages that don't agree about being + * siblings are known to be a particularly good indicator of corruption + * involving partial writes/lost updates. The bt_right_page_check_scankey + * check also provides a way of detecting cross-page inconsistencies for + * !readonly callers, but it can only detect sibling pages that have an + * out-of-order keyspace, which can't catch many of the problems that we + * expect to catch here. + * + * The classic example of the kind of inconsistency that we can only catch + * with this check (when in !readonly mode) involves three sibling pages that + * were affected by a faulty page split at some point in the past. The + * effects of the split are reflected in the original page and its new right + * sibling page, with a lack of any accompanying changes for the _original_ + * right sibling page. The original right sibling page's left link fails to + * point to the new right sibling page (its left link still points to the + * original page), even though the first phase of a page split is supposed to + * work as a single atomic action. This subtle inconsistency will probably + * only break backwards scans in practice. + * + * Note that this is the only place where amcheck will "couple" buffer locks + * (and only for !readonly callers). In general we prefer to avoid more + * thorough cross-page checks in !readonly mode, but it seems worth the + * complexity here. Also, the performance overhead of performing lock + * coupling here is negligible in practice. Control only reaches here with a + * non-corrupt index when there is a concurrent page split at the instant + * caller crossed over to target page from leftcurrent page. + */ +static void +bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent) +{ + /* passing metapage to BTPageGetOpaque() would give irrelevant findings */ + Assert(leftcurrent != P_NONE); + + if (!state->readonly) + { + Buffer lbuf; + Buffer newtargetbuf; + Page page; + BTPageOpaque opaque; + BlockNumber newtargetblock; + + /* Couple locks in the usual order for nbtree: Left to right */ + lbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, leftcurrent, + RBM_NORMAL, state->checkstrategy); + LockBuffer(lbuf, BT_READ); + _bt_checkpage(state->rel, lbuf); + page = BufferGetPage(lbuf); + opaque = BTPageGetOpaque(page); + if (P_ISDELETED(opaque)) + { + /* + * Cannot reason about concurrently deleted page -- the left link + * in the page to the right is expected to point to some other + * page to the left (not leftcurrent page). + * + * Note that we deliberately don't give up with a half-dead page. + */ + UnlockReleaseBuffer(lbuf); + return; + } + + newtargetblock = opaque->btpo_next; + /* Avoid self-deadlock when newtargetblock == leftcurrent */ + if (newtargetblock != leftcurrent) + { + newtargetbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, + newtargetblock, RBM_NORMAL, + state->checkstrategy); + LockBuffer(newtargetbuf, BT_READ); + _bt_checkpage(state->rel, newtargetbuf); + page = BufferGetPage(newtargetbuf); + opaque = BTPageGetOpaque(page); + /* btpo_prev_from_target may have changed; update it */ + btpo_prev_from_target = opaque->btpo_prev; + } + else + { + /* + * leftcurrent right sibling points back to leftcurrent block. + * Index is corrupt. Easiest way to handle this is to pretend + * that we actually read from a distinct page that has an invalid + * block number in its btpo_prev. + */ + newtargetbuf = InvalidBuffer; + btpo_prev_from_target = InvalidBlockNumber; + } + + /* + * No need to check P_ISDELETED here, since new target block cannot be + * marked deleted as long as we hold a lock on lbuf + */ + if (BufferIsValid(newtargetbuf)) + UnlockReleaseBuffer(newtargetbuf); + UnlockReleaseBuffer(lbuf); + + if (btpo_prev_from_target == leftcurrent) + { + /* Report split in left sibling, not target (or new target) */ + ereport(DEBUG1, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("harmless concurrent page split detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u new right sibling=%u original right sibling=%u.", + leftcurrent, newtargetblock, + state->targetblock))); + return; + } + + /* + * Index is corrupt. Make sure that we report correct target page. + * + * This could have changed in cases where there was a concurrent page + * split, as well as index corruption (at least in theory). Note that + * btpo_prev_from_target was already updated above. + */ + state->targetblock = newtargetblock; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("left link/right link pair in index \"%s\" not in agreement", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u left block=%u left link from block=%u.", + state->targetblock, leftcurrent, + btpo_prev_from_target))); +} + +/* + * Function performs the following checks on target page, or pages ancillary to + * target page: + * + * - That every "real" data item is less than or equal to the high key, which + * is an upper bound on the items on the page. Data items should be + * strictly less than the high key when the page is an internal page. + * + * - That within the page, every data item is strictly less than the item + * immediately to its right, if any (i.e., that the items are in order + * within the page, so that the binary searches performed by index scans are + * sane). + * + * - That the last data item stored on the page is strictly less than the + * first data item on the page to the right (when such a first item is + * available). + * + * - Various checks on the structure of tuples themselves. For example, check + * that non-pivot tuples have no truncated attributes. + * + * Furthermore, when state passed shows ShareLock held, function also checks: + * + * - That all child pages respect strict lower bound from parent's pivot + * tuple. + * + * - That downlink to block was encountered in parent where that's expected. + * + * - That high keys of child pages matches corresponding pivot keys in parent. + * + * This is also where heapallindexed callers use their Bloom filter to + * fingerprint IndexTuples for later table_index_build_scan() verification. + * + * Note: Memory allocated in this routine is expected to be released by caller + * resetting state->targetcontext. + */ +static void +bt_target_page_check(BtreeCheckState *state) +{ + OffsetNumber offset; + OffsetNumber max; + BTPageOpaque topaque; + + topaque = BTPageGetOpaque(state->target); + max = PageGetMaxOffsetNumber(state->target); + + elog(DEBUG2, "verifying %u items on %s block %u", max, + P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock); + + /* + * Check the number of attributes in high key. Note, rightmost page + * doesn't contain a high key, so nothing to check + */ + if (!P_RIGHTMOST(topaque)) + { + ItemId itemid; + IndexTuple itup; + + /* Verify line pointer before checking tuple */ + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, P_HIKEY); + if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target, + P_HIKEY)) + { + itup = (IndexTuple) PageGetItem(state->target, itemid); + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("wrong number of high key index tuple attributes in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%X.", + state->targetblock, + BTreeTupleGetNAtts(itup, state->rel), + P_ISLEAF(topaque) ? "heap" : "index", + LSN_FORMAT_ARGS(state->targetlsn)))); + } + } + + /* + * Loop over page items, starting from first non-highkey item, not high + * key (if any). Most tests are not performed for the "negative infinity" + * real item (if any). + */ + for (offset = P_FIRSTDATAKEY(topaque); + offset <= max; + offset = OffsetNumberNext(offset)) + { + ItemId itemid; + IndexTuple itup; + size_t tupsize; + BTScanInsert skey; + bool lowersizelimit; + ItemPointer scantid; + + CHECK_FOR_INTERRUPTS(); + + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, offset); + itup = (IndexTuple) PageGetItem(state->target, itemid); + tupsize = IndexTupleSize(itup); + + /* + * lp_len should match the IndexTuple reported length exactly, since + * lp_len is completely redundant in indexes, and both sources of + * tuple length are MAXALIGN()'d. nbtree does not use lp_len all that + * frequently, and is surprisingly tolerant of corrupt lp_len fields. + */ + if (tupsize != ItemIdGetLength(itemid)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index tuple size does not equal lp_len in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.", + state->targetblock, offset, + tupsize, ItemIdGetLength(itemid), + LSN_FORMAT_ARGS(state->targetlsn)), + errhint("This could be a torn page problem."))); + + /* Check the number of index tuple attributes */ + if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target, + offset)) + { + ItemPointer tid; + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetPointsToTID(itup); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("wrong number of index tuple attributes in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%X.", + itid, + BTreeTupleGetNAtts(itup, state->rel), + P_ISLEAF(topaque) ? "heap" : "index", + htid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + /* + * Don't try to generate scankey using "negative infinity" item on + * internal pages. They are always truncated to zero attributes. + */ + if (offset_is_negative_infinity(topaque, offset)) + { + /* + * We don't call bt_child_check() for "negative infinity" items. + * But if we're performing downlink connectivity check, we do it + * for every item including "negative infinity" one. + */ + if (!P_ISLEAF(topaque) && state->readonly) + { + bt_child_highkey_check(state, + offset, + NULL, + topaque->btpo_level); + } + continue; + } + + /* + * Readonly callers may optionally verify that non-pivot tuples can + * each be found by an independent search that starts from the root. + * Note that we deliberately don't do individual searches for each + * TID, since the posting list itself is validated by other checks. + */ + if (state->rootdescend && P_ISLEAF(topaque) && + !bt_rootdescend(state, itup)) + { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("could not find tuple using search from root page in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + itid, htid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + /* + * If tuple is a posting list tuple, make sure posting list TIDs are + * in order + */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid = psprintf("(%u,%u)", state->targetblock, offset); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("posting list contains misplaced TID in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.", + itid, i, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + ItemPointerCopy(current, &last); + } + } + + /* Build insertion scankey for current page offset */ + skey = bt_mkscankey_pivotsearch(state->rel, itup); + + /* + * Make sure tuple size does not exceed the relevant BTREE_VERSION + * specific limit. + * + * BTREE_VERSION 4 (which introduced heapkeyspace rules) requisitioned + * a small amount of space from BTMaxItemSize() in order to ensure + * that suffix truncation always has enough space to add an explicit + * heap TID back to a tuple -- we pessimistically assume that every + * newly inserted tuple will eventually need to have a heap TID + * appended during a future leaf page split, when the tuple becomes + * the basis of the new high key (pivot tuple) for the leaf page. + * + * Since the reclaimed space is reserved for that purpose, we must not + * enforce the slightly lower limit when the extra space has been used + * as intended. In other words, there is only a cross-version + * difference in the limit on tuple size within leaf pages. + * + * Still, we're particular about the details within BTREE_VERSION 4 + * internal pages. Pivot tuples may only use the extra space for its + * designated purpose. Enforce the lower limit for pivot tuples when + * an explicit heap TID isn't actually present. (In all other cases + * suffix truncation is guaranteed to generate a pivot tuple that's no + * larger than the firstright tuple provided to it by its caller.) + */ + lowersizelimit = skey->heapkeyspace && + (P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL); + if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) : + BTMaxItemSizeNoHeapTid(state->target))) + { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index row size %zu exceeds maximum for index \"%s\"", + tupsize, RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", + itid, + P_ISLEAF(topaque) ? "heap" : "index", + htid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + /* Fingerprint leaf page tuples (those that point to the heap) */ + if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid)) + { + IndexTuple norm; + + if (BTreeTupleIsPosting(itup)) + { + /* Fingerprint all elements as distinct "plain" tuples */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + IndexTuple logtuple; + + logtuple = bt_posting_plain_tuple(itup, i); + norm = bt_normalize_tuple(state, logtuple); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != logtuple) + pfree(norm); + pfree(logtuple); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } + } + + /* + * * High key check * + * + * If there is a high key (if this is not the rightmost page on its + * entire level), check that high key actually is upper bound on all + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. + * + * We prefer to check all items against high key rather than checking + * just the last and trusting that the operator class obeys the + * transitive law (which implies that all previous items also + * respected the high key invariant if they pass the item order + * check). + * + * Ideally, we'd compare every item in the index against every other + * item in the index, and not trust opclass obedience of the + * transitive law to bridge the gap between children and their + * grandparents (as well as great-grandparents, and so on). We don't + * go to those lengths because that would be prohibitively expensive, + * and probably not markedly more effective in practice. + * + * On the leaf level, we check that the key is <= the highkey. + * However, on non-leaf levels we check that the key is < the highkey, + * because the high key is "just another separator" rather than a copy + * of some existing key item; we expect it to be unique among all keys + * on the same level. (Suffix truncation will sometimes produce a + * leaf highkey that is an untruncated copy of the lastleft item, but + * never any other item, which necessitates weakening the leaf level + * check to <=.) + * + * Full explanation for why a highkey is never truly a copy of another + * item from the same level on internal levels: + * + * While the new left page's high key is copied from the first offset + * on the right page during an internal page split, that's not the + * full story. In effect, internal pages are split in the middle of + * the firstright tuple, not between the would-be lastleft and + * firstright tuples: the firstright key ends up on the left side as + * left's new highkey, and the firstright downlink ends up on the + * right side as right's new "negative infinity" item. The negative + * infinity tuple is truncated to zero attributes, so we're only left + * with the downlink. In other words, the copying is just an + * implementation detail of splitting in the middle of a (pivot) + * tuple. (See also: "Notes About Data Representation" in the nbtree + * README.) + */ + scantid = skey->scantid; + if (state->heapkeyspace && BTreeTupleIsPosting(itup)) + skey->scantid = BTreeTupleGetMaxHeapTID(itup); + + if (!P_RIGHTMOST(topaque) && + !(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) : + invariant_l_offset(state, skey, P_HIKEY))) + { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("high key invariant violated for index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.", + itid, + P_ISLEAF(topaque) ? "heap" : "index", + htid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + /* Reset, in case scantid was set to (itup) posting tuple's max TID */ + skey->scantid = scantid; + + /* + * * Item order check * + * + * Check that items are stored on page in logical order, by checking + * current item is strictly less than next item (if any). + */ + if (OffsetNumberNext(offset) <= max && + !invariant_l_offset(state, skey, OffsetNumberNext(offset))) + { + ItemPointer tid; + char *itid, + *htid, + *nitid, + *nhtid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetPointsToTID(itup); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + nitid = psprintf("(%u,%u)", state->targetblock, + OffsetNumberNext(offset)); + + /* Reuse itup to get pointed-to heap location of second item */ + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, + OffsetNumberNext(offset)); + itup = (IndexTuple) PageGetItem(state->target, itemid); + tid = BTreeTupleGetPointsToTID(itup); + nhtid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("item order invariant violated for index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Lower index tid=%s (points to %s tid=%s) " + "higher index tid=%s (points to %s tid=%s) " + "page lsn=%X/%X.", + itid, + P_ISLEAF(topaque) ? "heap" : "index", + htid, + nitid, + P_ISLEAF(topaque) ? "heap" : "index", + nhtid, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + /* + * * Last item check * + * + * Check last item against next/right page's first data item's when + * last item on page is reached. This additional check will detect + * transposed pages iff the supposed right sibling page happens to + * belong before target in the key space. (Otherwise, a subsequent + * heap verification will probably detect the problem.) + * + * This check is similar to the item order check that will have + * already been performed for every other "real" item on target page + * when last item is checked. The difference is that the next item + * (the item that is compared to target's last item) needs to come + * from the next/sibling page. There may not be such an item + * available from sibling for various reasons, though (e.g., target is + * the rightmost page on level). + */ + else if (offset == max) + { + BTScanInsert rightkey; + + /* Get item in next/right page */ + rightkey = bt_right_page_check_scankey(state); + + if (rightkey && + !invariant_g_offset(state, rightkey, max)) + { + /* + * As explained at length in bt_right_page_check_scankey(), + * there is a known !readonly race that could account for + * apparent violation of invariant, which we must check for + * before actually proceeding with raising error. Our canary + * condition is that target page was deleted. + */ + if (!state->readonly) + { + /* Get fresh copy of target page */ + state->target = palloc_btree_page(state, state->targetblock); + /* Note that we deliberately do not update target LSN */ + topaque = BTPageGetOpaque(state->target); + + /* + * All !readonly checks now performed; just return + */ + if (P_IGNORE(topaque)) + return; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("cross page item order invariant violated for index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%X.", + state->targetblock, offset, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + } + + /* + * * Downlink check * + * + * Additional check of child items iff this is an internal page and + * caller holds a ShareLock. This happens for every downlink (item) + * in target excluding the negative-infinity downlink (again, this is + * because it has no useful value to compare). + */ + if (!P_ISLEAF(topaque) && state->readonly) + bt_child_check(state, skey, offset); + } + + /* + * Special case bt_child_highkey_check() call + * + * We don't pass a real downlink, but we've to finish the level + * processing. If condition is satisfied, we've already processed all the + * downlinks from the target level. But there still might be pages to the + * right of the child page pointer to by our rightmost downlink. And they + * might have missing downlinks. This final call checks for them. + */ + if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) + { + bt_child_highkey_check(state, InvalidOffsetNumber, + NULL, topaque->btpo_level); + } +} + +/* + * Return a scankey for an item on page to right of current target (or the + * first non-ignorable page), sufficient to check ordering invariant on last + * item in current target page. Returned scankey relies on local memory + * allocated for the child page, which caller cannot pfree(). Caller's memory + * context should be reset between calls here. + * + * This is the first data item, and so all adjacent items are checked against + * their immediate sibling item (which may be on a sibling page, or even a + * "cousin" page at parent boundaries where target's rightlink points to page + * with different parent page). If no such valid item is available, return + * NULL instead. + * + * Note that !readonly callers must reverify that target page has not + * been concurrently deleted. + */ +static BTScanInsert +bt_right_page_check_scankey(BtreeCheckState *state) +{ + BTPageOpaque opaque; + ItemId rightitem; + IndexTuple firstitup; + BlockNumber targetnext; + Page rightpage; + OffsetNumber nline; + + /* Determine target's next block number */ + opaque = BTPageGetOpaque(state->target); + + /* If target is already rightmost, no right sibling; nothing to do here */ + if (P_RIGHTMOST(opaque)) + return NULL; + + /* + * General notes on concurrent page splits and page deletion: + * + * Routines like _bt_search() don't require *any* page split interlock + * when descending the tree, including something very light like a buffer + * pin. That's why it's okay that we don't either. This avoidance of any + * need to "couple" buffer locks is the raison d' etre of the Lehman & Yao + * algorithm, in fact. + * + * That leaves deletion. A deleted page won't actually be recycled by + * VACUUM early enough for us to fail to at least follow its right link + * (or left link, or downlink) and find its sibling, because recycling + * does not occur until no possible index scan could land on the page. + * Index scans can follow links with nothing more than their snapshot as + * an interlock and be sure of at least that much. (See page + * recycling/"visible to everyone" notes in nbtree README.) + * + * Furthermore, it's okay if we follow a rightlink and find a half-dead or + * dead (ignorable) page one or more times. There will either be a + * further right link to follow that leads to a live page before too long + * (before passing by parent's rightmost child), or we will find the end + * of the entire level instead (possible when parent page is itself the + * rightmost on its level). + */ + targetnext = opaque->btpo_next; + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + rightpage = palloc_btree_page(state, targetnext); + opaque = BTPageGetOpaque(rightpage); + + if (!P_IGNORE(opaque) || P_RIGHTMOST(opaque)) + break; + + /* + * We landed on a deleted or half-dead sibling page. Step right until + * we locate a live sibling page. + */ + ereport(DEBUG2, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("level %u sibling page in block %u of index \"%s\" was found deleted or half dead", + opaque->btpo_level, targetnext, RelationGetRelationName(state->rel)), + errdetail_internal("Deleted page found when building scankey from right sibling."))); + + targetnext = opaque->btpo_next; + + /* Be slightly more pro-active in freeing this memory, just in case */ + pfree(rightpage); + } + + /* + * No ShareLock held case -- why it's safe to proceed. + * + * Problem: + * + * We must avoid false positive reports of corruption when caller treats + * item returned here as an upper bound on target's last item. In + * general, false positives are disallowed. Avoiding them here when + * caller is !readonly is subtle. + * + * A concurrent page deletion by VACUUM of the target page can result in + * the insertion of items on to this right sibling page that would + * previously have been inserted on our target page. There might have + * been insertions that followed the target's downlink after it was made + * to point to right sibling instead of target by page deletion's first + * phase. The inserters insert items that would belong on target page. + * This race is very tight, but it's possible. This is our only problem. + * + * Non-problems: + * + * We are not hindered by a concurrent page split of the target; we'll + * never land on the second half of the page anyway. A concurrent split + * of the right page will also not matter, because the first data item + * remains the same within the left half, which we'll reliably land on. If + * we had to skip over ignorable/deleted pages, it cannot matter because + * their key space has already been atomically merged with the first + * non-ignorable page we eventually find (doesn't matter whether the page + * we eventually find is a true sibling or a cousin of target, which we go + * into below). + * + * Solution: + * + * Caller knows that it should reverify that target is not ignorable + * (half-dead or deleted) when cross-page sibling item comparison appears + * to indicate corruption (invariant fails). This detects the single race + * condition that exists for caller. This is correct because the + * continued existence of target block as non-ignorable (not half-dead or + * deleted) implies that target page was not merged into from the right by + * deletion; the key space at or after target never moved left. Target's + * parent either has the same downlink to target as before, or a < + * downlink due to deletion at the left of target. Target either has the + * same highkey as before, or a highkey < before when there is a page + * split. (The rightmost concurrently-split-from-target-page page will + * still have the same highkey as target was originally found to have, + * which for our purposes is equivalent to target's highkey itself never + * changing, since we reliably skip over + * concurrently-split-from-target-page pages.) + * + * In simpler terms, we allow that the key space of the target may expand + * left (the key space can move left on the left side of target only), but + * the target key space cannot expand right and get ahead of us without + * our detecting it. The key space of the target cannot shrink, unless it + * shrinks to zero due to the deletion of the original page, our canary + * condition. (To be very precise, we're a bit stricter than that because + * it might just have been that the target page split and only the + * original target page was deleted. We can be more strict, just not more + * lax.) + * + * Top level tree walk caller moves on to next page (makes it the new + * target) following recovery from this race. (cf. The rationale for + * child/downlink verification needing a ShareLock within + * bt_child_check(), where page deletion is also the main source of + * trouble.) + * + * Note that it doesn't matter if right sibling page here is actually a + * cousin page, because in order for the key space to be readjusted in a + * way that causes us issues in next level up (guiding problematic + * concurrent insertions to the cousin from the grandparent rather than to + * the sibling from the parent), there'd have to be page deletion of + * target's parent page (affecting target's parent's downlink in target's + * grandparent page). Internal page deletion only occurs when there are + * no child pages (they were all fully deleted), and caller is checking + * that the target's parent has at least one non-deleted (so + * non-ignorable) child: the target page. (Note that the first phase of + * deletion atomically marks the page to be deleted half-dead/ignorable at + * the same time downlink in its parent is removed, so caller will + * definitely not fail to detect that this happened.) + * + * This trick is inspired by the method backward scans use for dealing + * with concurrent page splits; concurrent page deletion is a problem that + * similarly receives special consideration sometimes (it's possible that + * the backwards scan will re-read its "original" block after failing to + * find a right-link to it, having already moved in the opposite direction + * (right/"forwards") a few times to try to locate one). Just like us, + * that happens only to determine if there was a concurrent page deletion + * of a reference page, and just like us if there was a page deletion of + * that reference page it means we can move on from caring about the + * reference page. See the nbtree README for a full description of how + * that works. + */ + nline = PageGetMaxOffsetNumber(rightpage); + + /* + * Get first data item, if any + */ + if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque)) + { + /* Return first data item (if any) */ + rightitem = PageGetItemIdCareful(state, targetnext, rightpage, + P_FIRSTDATAKEY(opaque)); + } + else if (!P_ISLEAF(opaque) && + nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque))) + { + /* + * Return first item after the internal page's "negative infinity" + * item + */ + rightitem = PageGetItemIdCareful(state, targetnext, rightpage, + OffsetNumberNext(P_FIRSTDATAKEY(opaque))); + } + else + { + /* + * No first item. Page is probably empty leaf page, but it's also + * possible that it's an internal page with only a negative infinity + * item. + */ + ereport(DEBUG2, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("%s block %u of index \"%s\" has no first data item", + P_ISLEAF(opaque) ? "leaf" : "internal", targetnext, + RelationGetRelationName(state->rel)))); + return NULL; + } + + /* + * Return first real item scankey. Note that this relies on right page + * memory remaining allocated. + */ + firstitup = (IndexTuple) PageGetItem(rightpage, rightitem); + return bt_mkscankey_pivotsearch(state->rel, firstitup); +} + +/* + * Check if two tuples are binary identical except the block number. So, + * this function is capable to compare pivot keys on different levels. + */ +static bool +bt_pivot_tuple_identical(bool heapkeyspace, IndexTuple itup1, IndexTuple itup2) +{ + if (IndexTupleSize(itup1) != IndexTupleSize(itup2)) + return false; + + if (heapkeyspace) + { + /* + * Offset number will contain important information in heapkeyspace + * indexes: the number of attributes left in the pivot tuple following + * suffix truncation. Don't skip over it (compare it too). + */ + if (memcmp(&itup1->t_tid.ip_posid, &itup2->t_tid.ip_posid, + IndexTupleSize(itup1) - + offsetof(ItemPointerData, ip_posid)) != 0) + return false; + } + else + { + /* + * Cannot rely on offset number field having consistent value across + * levels on pg_upgrade'd !heapkeyspace indexes. Compare contents of + * tuple starting from just after item pointer (i.e. after block + * number and offset number). + */ + if (memcmp(&itup1->t_info, &itup2->t_info, + IndexTupleSize(itup1) - + offsetof(IndexTupleData, t_info)) != 0) + return false; + } + + return true; +} + +/*--- + * Check high keys on the child level. Traverse rightlinks from previous + * downlink to the current one. Check that there are no intermediate pages + * with missing downlinks. + * + * If 'loaded_child' is given, it's assumed to be the page pointed to by the + * downlink referenced by 'downlinkoffnum' of the target page. + * + * Basically this function is called for each target downlink and checks two + * invariants: + * + * 1) You can reach the next child from previous one via rightlinks; + * 2) Each child high key have matching pivot key on target level. + * + * Consider the sample tree picture. + * + * 1 + * / \ + * 2 <-> 3 + * / \ / \ + * 4 <> 5 <> 6 <> 7 <> 8 + * + * This function will be called for blocks 4, 5, 6 and 8. Consider what is + * happening for each function call. + * + * - The function call for block 4 initializes data structure and matches high + * key of block 4 to downlink's pivot key of block 2. + * - The high key of block 5 is matched to the high key of block 2. + * - The block 6 has an incomplete split flag set, so its high key isn't + * matched to anything. + * - The function call for block 8 checks that block 8 can be found while + * following rightlinks from block 6. The high key of block 7 will be + * matched to downlink's pivot key in block 3. + * + * There is also final call of this function, which checks that there is no + * missing downlinks for children to the right of the child referenced by + * rightmost downlink in target level. + */ +static void +bt_child_highkey_check(BtreeCheckState *state, + OffsetNumber target_downlinkoffnum, + Page loaded_child, + uint32 target_level) +{ + BlockNumber blkno = state->prevrightlink; + Page page; + BTPageOpaque opaque; + bool rightsplit = state->previncompletesplit; + bool first = true; + ItemId itemid; + IndexTuple itup; + BlockNumber downlink; + + if (OffsetNumberIsValid(target_downlinkoffnum)) + { + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, target_downlinkoffnum); + itup = (IndexTuple) PageGetItem(state->target, itemid); + downlink = BTreeTupleGetDownLink(itup); + } + else + { + downlink = P_NONE; + } + + /* + * If no previous rightlink is memorized for current level just below + * target page's level, we are about to start from the leftmost page. We + * can't follow rightlinks from previous page, because there is no + * previous page. But we still can match high key. + * + * So we initialize variables for the loop above like there is previous + * page referencing current child. Also we imply previous page to not + * have incomplete split flag, that would make us require downlink for + * current child. That's correct, because leftmost page on the level + * should always have parent downlink. + */ + if (!BlockNumberIsValid(blkno)) + { + blkno = downlink; + rightsplit = false; + } + + /* Move to the right on the child level */ + while (true) + { + /* + * Did we traverse the whole tree level and this is check for pages to + * the right of rightmost downlink? + */ + if (blkno == P_NONE && downlink == P_NONE) + { + state->prevrightlink = InvalidBlockNumber; + state->previncompletesplit = false; + return; + } + + /* Did we traverse the whole tree level and don't find next downlink? */ + if (blkno == P_NONE) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"", + state->prevrightlink, downlink, + RelationGetRelationName(state->rel)))); + + /* Load page contents */ + if (blkno == downlink && loaded_child) + page = loaded_child; + else + page = palloc_btree_page(state, blkno); + + opaque = BTPageGetOpaque(page); + + /* The first page we visit at the level should be leftmost */ + if (first && !BlockNumberIsValid(state->prevrightlink) && + !bt_leftmost_ignoring_half_dead(state, blkno, opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + LSN_FORMAT_ARGS(state->targetlsn)))); + + /* Do level sanity check */ + if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) && + opaque->btpo_level != target_level - 1) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block found while following rightlinks from child of index \"%s\" has invalid level", + RelationGetRelationName(state->rel)), + errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", + blkno, target_level - 1, opaque->btpo_level))); + + /* Try to detect circular links */ + if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("circular link chain found in block %u of index \"%s\"", + blkno, RelationGetRelationName(state->rel)))); + + if (blkno != downlink && !P_IGNORE(opaque)) + { + /* blkno probably has missing parent downlink */ + bt_downlink_missing_check(state, rightsplit, blkno, page); + } + + rightsplit = P_INCOMPLETE_SPLIT(opaque); + + /* + * If we visit page with high key, check that it is equal to the + * target key next to corresponding downlink. + */ + if (!rightsplit && !P_RIGHTMOST(opaque)) + { + BTPageOpaque topaque; + IndexTuple highkey; + OffsetNumber pivotkey_offset; + + /* Get high key */ + itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY); + highkey = (IndexTuple) PageGetItem(page, itemid); + + /* + * There might be two situations when we examine high key. If + * current child page is referenced by given target downlink, we + * should look to the next offset number for matching key from + * target page. + * + * Alternatively, we're following rightlinks somewhere in the + * middle between page referenced by previous target's downlink + * and the page referenced by current target's downlink. If + * current child page hasn't incomplete split flag set, then its + * high key should match to the target's key of current offset + * number. This happens when a previous call here (to + * bt_child_highkey_check()) found an incomplete split, and we + * reach a right sibling page without a downlink -- the right + * sibling page's high key still needs to be matched to a + * separator key on the parent/target level. + * + * Don't apply OffsetNumberNext() to target_downlinkoffnum when we + * already had to step right on the child level. Our traversal of + * the child level must try to move in perfect lockstep behind (to + * the left of) the target/parent level traversal. + */ + if (blkno == downlink) + pivotkey_offset = OffsetNumberNext(target_downlinkoffnum); + else + pivotkey_offset = target_downlinkoffnum; + + topaque = BTPageGetOpaque(state->target); + + if (!offset_is_negative_infinity(topaque, pivotkey_offset)) + { + /* + * If we're looking for the next pivot tuple in target page, + * but there is no more pivot tuples, then we should match to + * high key instead. + */ + if (pivotkey_offset > PageGetMaxOffsetNumber(state->target)) + { + if (P_RIGHTMOST(topaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + LSN_FORMAT_ARGS(state->targetlsn)))); + pivotkey_offset = P_HIKEY; + } + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, pivotkey_offset); + itup = (IndexTuple) PageGetItem(state->target, itemid); + } + else + { + /* + * We cannot try to match child's high key to a negative + * infinity key in target, since there is nothing to compare. + * However, it's still possible to match child's high key + * outside of target page. The reason why we're are is that + * bt_child_highkey_check() was previously called for the + * cousin page of 'loaded_child', which is incomplete split. + * So, now we traverse to the right of that cousin page and + * current child level page under consideration still belongs + * to the subtree of target's left sibling. Thus, we need to + * match child's high key to it's left uncle page high key. + * Thankfully we saved it, it's called a "low key" of target + * page. + */ + if (!state->lowkey) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("can't find left sibling high key in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + LSN_FORMAT_ARGS(state->targetlsn)))); + itup = state->lowkey; + } + + if (!bt_pivot_tuple_identical(state->heapkeyspace, highkey, itup)) + { + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("mismatch between parent key and child high key in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + } + + /* Exit if we already found next downlink */ + if (blkno == downlink) + { + state->prevrightlink = opaque->btpo_next; + state->previncompletesplit = rightsplit; + return; + } + + /* Traverse to the next page using rightlink */ + blkno = opaque->btpo_next; + + /* Free page contents if it's allocated by us */ + if (page != loaded_child) + pfree(page); + first = false; + } +} + +/* + * Checks one of target's downlink against its child page. + * + * Conceptually, the target page continues to be what is checked here. The + * target block is still blamed in the event of finding an invariant violation. + * The downlink insertion into the target is probably where any problem raised + * here arises, and there is no such thing as a parent link, so doing the + * verification this way around is much more practical. + * + * This function visits child page and it's sequentially called for each + * downlink of target page. Assuming this we also check downlink connectivity + * here in order to save child page visits. + */ +static void +bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, + OffsetNumber downlinkoffnum) +{ + ItemId itemid; + IndexTuple itup; + BlockNumber childblock; + OffsetNumber offset; + OffsetNumber maxoffset; + Page child; + BTPageOpaque copaque; + BTPageOpaque topaque; + + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, downlinkoffnum); + itup = (IndexTuple) PageGetItem(state->target, itemid); + childblock = BTreeTupleGetDownLink(itup); + + /* + * Caller must have ShareLock on target relation, because of + * considerations around page deletion by VACUUM. + * + * NB: In general, page deletion deletes the right sibling's downlink, not + * the downlink of the page being deleted; the deleted page's downlink is + * reused for its sibling. The key space is thereby consolidated between + * the deleted page and its right sibling. (We cannot delete a parent + * page's rightmost child unless it is the last child page, and we intend + * to also delete the parent itself.) + * + * If this verification happened without a ShareLock, the following race + * condition could cause false positives: + * + * In general, concurrent page deletion might occur, including deletion of + * the left sibling of the child page that is examined here. If such a + * page deletion were to occur, closely followed by an insertion into the + * newly expanded key space of the child, a window for the false positive + * opens up: the stale parent/target downlink originally followed to get + * to the child legitimately ceases to be a lower bound on all items in + * the page, since the key space was concurrently expanded "left". + * (Insertion followed the "new" downlink for the child, not our now-stale + * downlink, which was concurrently physically removed in target/parent as + * part of deletion's first phase.) + * + * While we use various techniques elsewhere to perform cross-page + * verification for !readonly callers, a similar trick seems difficult + * here. The tricks used by bt_recheck_sibling_links and by + * bt_right_page_check_scankey both involve verification of a same-level, + * cross-sibling invariant. Cross-level invariants are far more squishy, + * though. The nbtree REDO routines do not actually couple buffer locks + * across levels during page splits, so making any cross-level check work + * reliably in !readonly mode may be impossible. + */ + Assert(state->readonly); + + /* + * Verify child page has the downlink key from target page (its parent) as + * a lower bound; downlink must be strictly less than all keys on the + * page. + * + * Check all items, rather than checking just the first and trusting that + * the operator class obeys the transitive law. + */ + topaque = BTPageGetOpaque(state->target); + child = palloc_btree_page(state, childblock); + copaque = BTPageGetOpaque(child); + maxoffset = PageGetMaxOffsetNumber(child); + + /* + * Since we've already loaded the child block, combine this check with + * check for downlink connectivity. + */ + bt_child_highkey_check(state, downlinkoffnum, + child, topaque->btpo_level); + + /* + * Since there cannot be a concurrent VACUUM operation in readonly mode, + * and since a page has no links within other pages (siblings and parent) + * once it is marked fully deleted, it should be impossible to land on a + * fully deleted page. + * + * It does not quite make sense to enforce that the page cannot even be + * half-dead, despite the fact the downlink is modified at the same stage + * that the child leaf page is marked half-dead. That's incorrect because + * there may occasionally be multiple downlinks from a chain of pages + * undergoing deletion, where multiple successive calls are made to + * _bt_unlink_halfdead_page() by VACUUM before it can finally safely mark + * the leaf page as fully dead. While _bt_mark_page_halfdead() usually + * removes the downlink to the leaf page that is marked half-dead, that's + * not guaranteed, so it's possible we'll land on a half-dead page with a + * downlink due to an interrupted multi-level page deletion. + * + * We go ahead with our checks if the child page is half-dead. It's safe + * to do so because we do not test the child's high key, so it does not + * matter that the original high key will have been replaced by a dummy + * truncated high key within _bt_mark_page_halfdead(). All other page + * items are left intact on a half-dead page, so there is still something + * to test. + */ + if (P_ISDELETED(copaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("downlink to deleted page found in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.", + state->targetblock, childblock, + LSN_FORMAT_ARGS(state->targetlsn)))); + + for (offset = P_FIRSTDATAKEY(copaque); + offset <= maxoffset; + offset = OffsetNumberNext(offset)) + { + /* + * Skip comparison of target page key against "negative infinity" + * item, if any. Checking it would indicate that it's not a strict + * lower bound, but that's only because of the hard-coding for + * negative infinity items within _bt_compare(). + * + * If nbtree didn't truncate negative infinity tuples during internal + * page splits then we'd expect child's negative infinity key to be + * equal to the scankey/downlink from target/parent (it would be a + * "low key" in this hypothetical scenario, and so it would still need + * to be treated as a special case here). + * + * Negative infinity items can be thought of as a strict lower bound + * that works transitively, with the last non-negative-infinity pivot + * followed during a descent from the root as its "true" strict lower + * bound. Only a small number of negative infinity items are truly + * negative infinity; those that are the first items of leftmost + * internal pages. In more general terms, a negative infinity item is + * only negative infinity with respect to the subtree that the page is + * at the root of. + * + * See also: bt_rootdescend(), which can even detect transitive + * inconsistencies on cousin leaf pages. + */ + if (offset_is_negative_infinity(copaque, offset)) + continue; + + if (!invariant_l_nontarget_offset(state, targetkey, childblock, child, + offset)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("down-link lower bound invariant violated for index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%X.", + state->targetblock, childblock, offset, + LSN_FORMAT_ARGS(state->targetlsn)))); + } + + pfree(child); +} + +/* + * Checks if page is missing a downlink that it should have. + * + * A page that lacks a downlink/parent may indicate corruption. However, we + * must account for the fact that a missing downlink can occasionally be + * encountered in a non-corrupt index. This can be due to an interrupted page + * split, or an interrupted multi-level page deletion (i.e. there was a hard + * crash or an error during a page split, or while VACUUM was deleting a + * multi-level chain of pages). + * + * Note that this can only be called in readonly mode, so there is no need to + * be concerned about concurrent page splits or page deletions. + */ +static void +bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber blkno, Page page) +{ + BTPageOpaque opaque = BTPageGetOpaque(page); + ItemId itemid; + IndexTuple itup; + Page child; + BTPageOpaque copaque; + uint32 level; + BlockNumber childblk; + XLogRecPtr pagelsn; + + Assert(state->readonly); + Assert(!P_IGNORE(opaque)); + + /* No next level up with downlinks to fingerprint from the true root */ + if (P_ISROOT(opaque)) + return; + + pagelsn = PageGetLSN(page); + + /* + * Incomplete (interrupted) page splits can account for the lack of a + * downlink. Some inserting transaction should eventually complete the + * page split in passing, when it notices that the left sibling page is + * P_INCOMPLETE_SPLIT(). + * + * In general, VACUUM is not prepared for there to be no downlink to a + * page that it deletes. This is the main reason why the lack of a + * downlink can be reported as corruption here. It's not obvious that an + * invalid missing downlink can result in wrong answers to queries, + * though, since index scans that land on the child may end up + * consistently moving right. The handling of concurrent page splits (and + * page deletions) within _bt_moveright() cannot distinguish + * inconsistencies that last for a moment from inconsistencies that are + * permanent and irrecoverable. + * + * VACUUM isn't even prepared to delete pages that have no downlink due to + * an incomplete page split, but it can detect and reason about that case + * by design, so it shouldn't be taken to indicate corruption. See + * _bt_pagedel() for full details. + */ + if (rightsplit) + { + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("harmless interrupted page split detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", + blkno, opaque->btpo_level, + opaque->btpo_prev, + LSN_FORMAT_ARGS(pagelsn)))); + return; + } + + /* + * Page under check is probably the "top parent" of a multi-level page + * deletion. We'll need to descend the subtree to make sure that + * descendant pages are consistent with that, though. + * + * If the page (which must be non-ignorable) is a leaf page, then clearly + * it can't be the top parent. The lack of a downlink is probably a + * symptom of a broad problem that could just as easily cause + * inconsistencies anywhere else. + */ + if (P_ISLEAF(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("leaf index block lacks downlink in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u page lsn=%X/%X.", + blkno, + LSN_FORMAT_ARGS(pagelsn)))); + + /* Descend from the given page, which is an internal page */ + elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", + RelationGetRelationName(state->rel)); + + level = opaque->btpo_level; + itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque)); + itup = (IndexTuple) PageGetItem(page, itemid); + childblk = BTreeTupleGetDownLink(itup); + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + child = palloc_btree_page(state, childblk); + copaque = BTPageGetOpaque(child); + + if (P_ISLEAF(copaque)) + break; + + /* Do an extra sanity check in passing on internal pages */ + if (copaque->btpo_level != level - 1) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", + RelationGetRelationName(state->rel)), + errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", + blkno, childblk, + level - 1, copaque->btpo_level))); + + level = copaque->btpo_level; + itemid = PageGetItemIdCareful(state, childblk, child, + P_FIRSTDATAKEY(copaque)); + itup = (IndexTuple) PageGetItem(child, itemid); + childblk = BTreeTupleGetDownLink(itup); + /* Be slightly more pro-active in freeing this memory, just in case */ + pfree(child); + } + + /* + * Since there cannot be a concurrent VACUUM operation in readonly mode, + * and since a page has no links within other pages (siblings and parent) + * once it is marked fully deleted, it should be impossible to land on a + * fully deleted page. See bt_child_check() for further details. + * + * The bt_child_check() P_ISDELETED() check is repeated here because + * bt_child_check() does not visit pages reachable through negative + * infinity items. Besides, bt_child_check() is unwilling to descend + * multiple levels. (The similar bt_child_check() P_ISDELETED() check + * within bt_check_level_from_leftmost() won't reach the page either, + * since the leaf's live siblings should have their sibling links updated + * to bypass the deletion target page when it is marked fully dead.) + * + * If this error is raised, it might be due to a previous multi-level page + * deletion that failed to realize that it wasn't yet safe to mark the + * leaf page as fully dead. A "dangling downlink" will still remain when + * this happens. The fact that the dangling downlink's page (the leaf's + * parent/ancestor page) lacked a downlink is incidental. + */ + if (P_ISDELETED(copaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("downlink to deleted leaf page found in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.", + blkno, childblk, + LSN_FORMAT_ARGS(pagelsn)))); + + /* + * Iff leaf page is half-dead, its high key top parent link should point + * to what VACUUM considered to be the top parent page at the instant it + * was interrupted. Provided the high key link actually points to the + * page under check, the missing downlink we detected is consistent with + * there having been an interrupted multi-level page deletion. This means + * that the subtree with the page under check at its root (a page deletion + * chain) is in a consistent state, enabling VACUUM to resume deleting the + * entire chain the next time it encounters the half-dead leaf page. + */ + if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque)) + { + itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY); + itup = (IndexTuple) PageGetItem(child, itemid); + if (BTreeTupleGetTopParent(itup) == blkno) + return; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("internal index block lacks downlink in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u level=%u page lsn=%X/%X.", + blkno, opaque->btpo_level, + LSN_FORMAT_ARGS(pagelsn)))); +} + +/* + * Per-tuple callback from table_index_build_scan, used to determine if index has + * all the entries that definitely should have been observed in leaf pages of + * the target index (that is, all IndexTuples that were fingerprinted by our + * Bloom filter). All heapallindexed checks occur here. + * + * The redundancy between an index and the table it indexes provides a good + * opportunity to detect corruption, especially corruption within the table. + * The high level principle behind the verification performed here is that any + * IndexTuple that should be in an index following a fresh CREATE INDEX (based + * on the same index definition) should also have been in the original, + * existing index, which should have used exactly the same representation + * + * Since the overall structure of the index has already been verified, the most + * likely explanation for error here is a corrupt heap page (could be logical + * or physical corruption). Index corruption may still be detected here, + * though. Only readonly callers will have verified that left links and right + * links are in agreement, and so it's possible that a leaf page transposition + * within index is actually the source of corruption detected here (for + * !readonly callers). The checks performed only for readonly callers might + * more accurately frame the problem as a cross-page invariant issue (this + * could even be due to recovery not replaying all WAL records). The !readonly + * ERROR message raised here includes a HINT about retrying with readonly + * verification, just in case it's a cross-page invariant issue, though that + * isn't particularly likely. + * + * table_index_build_scan() expects to be able to find the root tuple when a + * heap-only tuple (the live tuple at the end of some HOT chain) needs to be + * indexed, in order to replace the actual tuple's TID with the root tuple's + * TID (which is what we're actually passed back here). The index build heap + * scan code will raise an error when a tuple that claims to be the root of the + * heap-only tuple's HOT chain cannot be located. This catches cases where the + * original root item offset/root tuple for a HOT chain indicates (for whatever + * reason) that the entire HOT chain is dead, despite the fact that the latest + * heap-only tuple should be indexed. When this happens, sequential scans may + * always give correct answers, and all indexes may be considered structurally + * consistent (i.e. the nbtree structural checks would not detect corruption). + * It may be the case that only index scans give wrong answers, and yet heap or + * SLRU corruption is the real culprit. (While it's true that LP_DEAD bit + * setting will probably also leave the index in a corrupt state before too + * long, the problem is nonetheless that there is heap corruption.) + * + * Heap-only tuple handling within table_index_build_scan() works in a way that + * helps us to detect index tuples that contain the wrong values (values that + * don't match the latest tuple in the HOT chain). This can happen when there + * is no superseding index tuple due to a faulty assessment of HOT safety, + * perhaps during the original CREATE INDEX. Because the latest tuple's + * contents are used with the root TID, an error will be raised when a tuple + * with the same TID but non-matching attribute values is passed back to us. + * Faulty assessment of HOT-safety was behind at least two distinct CREATE + * INDEX CONCURRENTLY bugs that made it into stable releases, one of which was + * undetected for many years. In short, the same principle that allows a + * REINDEX to repair corruption when there was an (undetected) broken HOT chain + * also allows us to detect the corruption in many cases. + */ +static void +bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *checkstate) +{ + BtreeCheckState *state = (BtreeCheckState *) checkstate; + IndexTuple itup, + norm; + + Assert(state->heapallindexed); + + /* Generate a normalized index tuple for fingerprinting */ + itup = index_form_tuple(RelationGetDescr(index), values, isnull); + itup->t_tid = *tid; + norm = bt_normalize_tuple(state, itup); + + /* Probe Bloom filter -- tuple should be present */ + if (bloom_lacks_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm))) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid)), + RelationGetRelationName(state->heaprel), + RelationGetRelationName(state->rel)), + !state->readonly + ? errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.") + : 0)); + + state->heaptuplespresent++; + pfree(itup); + /* Cannot leak memory here */ + if (norm != itup) + pfree(norm); +} + +/* + * Normalize an index tuple for fingerprinting. + * + * In general, index tuple formation is assumed to be deterministic by + * heapallindexed verification, and IndexTuples are assumed immutable. While + * the LP_DEAD bit is mutable in leaf pages, that's ItemId metadata, which is + * not fingerprinted. Normalization is required to compensate for corner + * cases where the determinism assumption doesn't quite work. + * + * There is currently one such case: index_form_tuple() does not try to hide + * the source TOAST state of input datums. The executor applies TOAST + * compression for heap tuples based on different criteria to the compression + * applied within btinsert()'s call to index_form_tuple(): it sometimes + * compresses more aggressively, resulting in compressed heap tuple datums but + * uncompressed corresponding index tuple datums. A subsequent heapallindexed + * verification will get a logically equivalent though bitwise unequal tuple + * from index_form_tuple(). False positive heapallindexed corruption reports + * could occur without normalizing away the inconsistency. + * + * Returned tuple is often caller's own original tuple. Otherwise, it is a + * new representation of caller's original index tuple, palloc()'d in caller's + * memory context. + * + * Note: This routine is not concerned with distinctions about the + * representation of tuples beyond those that might break heapallindexed + * verification. In particular, it won't try to normalize opclass-equal + * datums with potentially distinct representations (e.g., btree/numeric_ops + * index datums will not get their display scale normalized-away here). + * Caller does normalization for non-pivot tuples that have a posting list, + * since dummy CREATE INDEX callback code generates new tuples with the same + * normalized representation. + */ +static IndexTuple +bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) +{ + TupleDesc tupleDescriptor = RelationGetDescr(state->rel); + Datum normalized[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + bool toast_free[INDEX_MAX_KEYS]; + bool formnewtup = false; + IndexTuple reformed; + int i; + + /* Caller should only pass "logical" non-pivot tuples here */ + Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup)); + + /* Easy case: It's immediately clear that tuple has no varlena datums */ + if (!IndexTupleHasVarwidths(itup)) + return itup; + + for (i = 0; i < tupleDescriptor->natts; i++) + { + Form_pg_attribute att; + + att = TupleDescAttr(tupleDescriptor, i); + + /* Assume untoasted/already normalized datum initially */ + toast_free[i] = false; + normalized[i] = index_getattr(itup, att->attnum, + tupleDescriptor, + &isnull[i]); + if (att->attbyval || att->attlen != -1 || isnull[i]) + continue; + + /* + * Callers always pass a tuple that could safely be inserted into the + * index without further processing, so an external varlena header + * should never be encountered here + */ + if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i]))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid)), + RelationGetRelationName(state->rel)))); + else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i]))) + { + formnewtup = true; + normalized[i] = PointerGetDatum(PG_DETOAST_DATUM(normalized[i])); + toast_free[i] = true; + } + } + + /* Easier case: Tuple has varlena datums, none of which are compressed */ + if (!formnewtup) + return itup; + + /* + * Hard case: Tuple had compressed varlena datums that necessitate + * creating normalized version of the tuple from uncompressed input datums + * (normalized input datums). This is rather naive, but shouldn't be + * necessary too often. + * + * Note that we rely on deterministic index_form_tuple() TOAST compression + * of normalized input. + */ + reformed = index_form_tuple(tupleDescriptor, normalized, isnull); + reformed->t_tid = itup->t_tid; + + /* Cannot leak memory here */ + for (i = 0; i < tupleDescriptor->natts; i++) + if (toast_free[i]) + pfree(DatumGetPointer(normalized[i])); + + return reformed; +} + +/* + * Produce palloc()'d "plain" tuple for nth posting list entry/TID. + * + * In general, deduplication is not supposed to change the logical contents of + * an index. Multiple index tuples are merged together into one equivalent + * posting list index tuple when convenient. + * + * heapallindexed verification must normalize-away this variation in + * representation by converting posting list tuples into two or more "plain" + * tuples. Each tuple must be fingerprinted separately -- there must be one + * tuple for each corresponding Bloom filter probe during the heap scan. + * + * Note: Caller still needs to call bt_normalize_tuple() with returned tuple. + */ +static inline IndexTuple +bt_posting_plain_tuple(IndexTuple itup, int n) +{ + Assert(BTreeTupleIsPosting(itup)); + + /* Returns non-posting-list tuple */ + return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1); +} + +/* + * Search for itup in index, starting from fast root page. itup must be a + * non-pivot tuple. This is only supported with heapkeyspace indexes, since + * we rely on having fully unique keys to find a match with only a single + * visit to a leaf page, barring an interrupted page split, where we may have + * to move right. (A concurrent page split is impossible because caller must + * be readonly caller.) + * + * This routine can detect very subtle transitive consistency issues across + * more than one level of the tree. Leaf pages all have a high key (even the + * rightmost page has a conceptual positive infinity high key), but not a low + * key. Their downlink in parent is a lower bound, which along with the high + * key is almost enough to detect every possible inconsistency. A downlink + * separator key value won't always be available from parent, though, because + * the first items of internal pages are negative infinity items, truncated + * down to zero attributes during internal page splits. While it's true that + * bt_child_check() and the high key check can detect most imaginable key + * space problems, there are remaining problems it won't detect with non-pivot + * tuples in cousin leaf pages. Starting a search from the root for every + * existing leaf tuple detects small inconsistencies in upper levels of the + * tree that cannot be detected any other way. (Besides all this, this is + * probably also useful as a direct test of the code used by index scans + * themselves.) + */ +static bool +bt_rootdescend(BtreeCheckState *state, IndexTuple itup) +{ + BTScanInsert key; + BTStack stack; + Buffer lbuf; + bool exists; + + key = _bt_mkscankey(state->rel, itup); + Assert(key->heapkeyspace && key->scantid != NULL); + + /* + * Search from root. + * + * Ideally, we would arrange to only move right within _bt_search() when + * an interrupted page split is detected (i.e. when the incomplete split + * bit is found to be set), but for now we accept the possibility that + * that could conceal an inconsistency. + */ + Assert(state->readonly && state->rootdescend); + exists = false; + stack = _bt_search(state->rel, NULL, key, &lbuf, BT_READ, NULL); + + if (BufferIsValid(lbuf)) + { + BTInsertStateData insertstate; + OffsetNumber offnum; + Page page; + + insertstate.itup = itup; + insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); + insertstate.itup_key = key; + insertstate.postingoff = 0; + insertstate.bounds_valid = false; + insertstate.buf = lbuf; + + /* Get matching tuple on leaf page */ + offnum = _bt_binsrch_insert(state->rel, &insertstate); + /* Compare first >= matching item on leaf page, if any */ + page = BufferGetPage(lbuf); + /* Should match on first heap TID when tuple has a posting list */ + if (offnum <= PageGetMaxOffsetNumber(page) && + insertstate.postingoff <= 0 && + _bt_compare(state->rel, key, page, offnum) == 0) + exists = true; + _bt_relbuf(state->rel, lbuf); + } + + _bt_freestack(stack); + pfree(key); + + return exists; +} + +/* + * Is particular offset within page (whose special state is passed by caller) + * the page negative-infinity item? + * + * As noted in comments above _bt_compare(), there is special handling of the + * first data item as a "negative infinity" item. The hard-coding within + * _bt_compare() makes comparing this item for the purposes of verification + * pointless at best, since the IndexTuple only contains a valid TID (a + * reference TID to child page). + */ +static inline bool +offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset) +{ + /* + * For internal pages only, the first item after high key, if any, is + * negative infinity item. Internal pages always have a negative infinity + * item, whereas leaf pages never have one. This implies that negative + * infinity item is either first or second line item, or there is none + * within page. + * + * Negative infinity items are a special case among pivot tuples. They + * always have zero attributes, while all other pivot tuples always have + * nkeyatts attributes. + * + * Right-most pages don't have a high key, but could be said to + * conceptually have a "positive infinity" high key. Thus, there is a + * symmetry between down link items in parent pages, and high keys in + * children. Together, they represent the part of the key space that + * belongs to each page in the index. For example, all children of the + * root page will have negative infinity as a lower bound from root + * negative infinity downlink, and positive infinity as an upper bound + * (implicitly, from "imaginary" positive infinity high key in root). + */ + return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque); +} + +/* + * Does the invariant hold that the key is strictly less than a given upper + * bound offset item? + * + * Verifies line pointer on behalf of caller. + * + * If this function returns false, convention is that caller throws error due + * to corruption. + */ +static inline bool +invariant_l_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber upperbound) +{ + ItemId itemid; + int32 cmp; + + Assert(key->pivotsearch); + + /* Verify line pointer before checking tuple */ + itemid = PageGetItemIdCareful(state, state->targetblock, state->target, + upperbound); + /* pg_upgrade'd indexes may legally have equal sibling tuples */ + if (!key->heapkeyspace) + return invariant_leq_offset(state, key, upperbound); + + cmp = _bt_compare(state->rel, key, state->target, upperbound); + + /* + * _bt_compare() is capable of determining that a scankey with a + * filled-out attribute is greater than pivot tuples where the comparison + * is resolved at a truncated attribute (value of attribute in pivot is + * minus infinity). However, it is not capable of determining that a + * scankey is _less than_ a tuple on the basis of a comparison resolved at + * _scankey_ minus infinity attribute. Complete an extra step to simulate + * having minus infinity values for omitted scankey attribute(s). + */ + if (cmp == 0) + { + BTPageOpaque topaque; + IndexTuple ritup; + int uppnkeyatts; + ItemPointer rheaptid; + bool nonpivot; + + ritup = (IndexTuple) PageGetItem(state->target, itemid); + topaque = BTPageGetOpaque(state->target); + nonpivot = P_ISLEAF(topaque) && upperbound >= P_FIRSTDATAKEY(topaque); + + /* Get number of keys + heap TID for item to the right */ + uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel); + rheaptid = BTreeTupleGetHeapTIDCareful(state, ritup, nonpivot); + + /* Heap TID is tiebreaker key attribute */ + if (key->keysz == uppnkeyatts) + return key->scantid == NULL && rheaptid != NULL; + + return key->keysz < uppnkeyatts; + } + + return cmp < 0; +} + +/* + * Does the invariant hold that the key is less than or equal to a given upper + * bound offset item? + * + * Caller should have verified that upperbound's line pointer is consistent + * using PageGetItemIdCareful() call. + * + * If this function returns false, convention is that caller throws error due + * to corruption. + */ +static inline bool +invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber upperbound) +{ + int32 cmp; + + Assert(key->pivotsearch); + + cmp = _bt_compare(state->rel, key, state->target, upperbound); + + return cmp <= 0; +} + +/* + * Does the invariant hold that the key is strictly greater than a given lower + * bound offset item? + * + * Caller should have verified that lowerbound's line pointer is consistent + * using PageGetItemIdCareful() call. + * + * If this function returns false, convention is that caller throws error due + * to corruption. + */ +static inline bool +invariant_g_offset(BtreeCheckState *state, BTScanInsert key, + OffsetNumber lowerbound) +{ + int32 cmp; + + Assert(key->pivotsearch); + + cmp = _bt_compare(state->rel, key, state->target, lowerbound); + + /* pg_upgrade'd indexes may legally have equal sibling tuples */ + if (!key->heapkeyspace) + return cmp >= 0; + + /* + * No need to consider the possibility that scankey has attributes that we + * need to force to be interpreted as negative infinity. _bt_compare() is + * able to determine that scankey is greater than negative infinity. The + * distinction between "==" and "<" isn't interesting here, since + * corruption is indicated either way. + */ + return cmp > 0; +} + +/* + * Does the invariant hold that the key is strictly less than a given upper + * bound offset item, with the offset relating to a caller-supplied page that + * is not the current target page? + * + * Caller's non-target page is a child page of the target, checked as part of + * checking a property of the target page (i.e. the key comes from the + * target). Verifies line pointer on behalf of caller. + * + * If this function returns false, convention is that caller throws error due + * to corruption. + */ +static inline bool +invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, + BlockNumber nontargetblock, Page nontarget, + OffsetNumber upperbound) +{ + ItemId itemid; + int32 cmp; + + Assert(key->pivotsearch); + + /* Verify line pointer before checking tuple */ + itemid = PageGetItemIdCareful(state, nontargetblock, nontarget, + upperbound); + cmp = _bt_compare(state->rel, key, nontarget, upperbound); + + /* pg_upgrade'd indexes may legally have equal sibling tuples */ + if (!key->heapkeyspace) + return cmp <= 0; + + /* See invariant_l_offset() for an explanation of this extra step */ + if (cmp == 0) + { + IndexTuple child; + int uppnkeyatts; + ItemPointer childheaptid; + BTPageOpaque copaque; + bool nonpivot; + + child = (IndexTuple) PageGetItem(nontarget, itemid); + copaque = BTPageGetOpaque(nontarget); + nonpivot = P_ISLEAF(copaque) && upperbound >= P_FIRSTDATAKEY(copaque); + + /* Get number of keys + heap TID for child/non-target item */ + uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel); + childheaptid = BTreeTupleGetHeapTIDCareful(state, child, nonpivot); + + /* Heap TID is tiebreaker key attribute */ + if (key->keysz == uppnkeyatts) + return key->scantid == NULL && childheaptid != NULL; + + return key->keysz < uppnkeyatts; + } + + return cmp < 0; +} + +/* + * Given a block number of a B-Tree page, return page in palloc()'d memory. + * While at it, perform some basic checks of the page. + * + * There is never an attempt to get a consistent view of multiple pages using + * multiple concurrent buffer locks; in general, we only acquire a single pin + * and buffer lock at a time, which is often all that the nbtree code requires. + * (Actually, bt_recheck_sibling_links couples buffer locks, which is the only + * exception to this general rule.) + * + * Operating on a copy of the page is useful because it prevents control + * getting stuck in an uninterruptible state when an underlying operator class + * misbehaves. + */ +static Page +palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) +{ + Buffer buffer; + Page page; + BTPageOpaque opaque; + OffsetNumber maxoffset; + + page = palloc(BLCKSZ); + + /* + * We copy the page into local storage to avoid holding pin on the buffer + * longer than we must. + */ + buffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, blocknum, RBM_NORMAL, + state->checkstrategy); + LockBuffer(buffer, BT_READ); + + /* + * Perform the same basic sanity checking that nbtree itself performs for + * every page: + */ + _bt_checkpage(state->rel, buffer); + + /* Only use copy of page in palloc()'d memory */ + memcpy(page, BufferGetPage(buffer), BLCKSZ); + UnlockReleaseBuffer(buffer); + + opaque = BTPageGetOpaque(page); + + if (P_ISMETA(opaque) && blocknum != BTREE_METAPAGE) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid meta page found at block %u in index \"%s\"", + blocknum, RelationGetRelationName(state->rel)))); + + /* Check page from block that ought to be meta page */ + if (blocknum == BTREE_METAPAGE) + { + BTMetaPageData *metad = BTPageGetMeta(page); + + if (!P_ISMETA(opaque) || + metad->btm_magic != BTREE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" meta page is corrupt", + RelationGetRelationName(state->rel)))); + + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimum supported version %d", + RelationGetRelationName(state->rel), + metad->btm_version, BTREE_VERSION, + BTREE_MIN_VERSION))); + + /* Finished with metapage checks */ + return page; + } + + /* + * Deleted pages that still use the old 32-bit XID representation have no + * sane "level" field because they type pun the field, but all other pages + * (including pages deleted on Postgres 14+) have a valid value. + */ + if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) + { + /* Okay, no reason not to trust btpo_level field from page */ + + if (P_ISLEAF(opaque) && opaque->btpo_level != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"", + opaque->btpo_level, blocknum, + RelationGetRelationName(state->rel)))); + + if (!P_ISLEAF(opaque) && opaque->btpo_level == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"", + blocknum, + RelationGetRelationName(state->rel)))); + } + + /* + * Sanity checks for number of items on page. + * + * As noted at the beginning of _bt_binsrch(), an internal page must have + * children, since there must always be a negative infinity downlink + * (there may also be a highkey). In the case of non-rightmost leaf + * pages, there must be at least a highkey. The exceptions are deleted + * pages, which contain no items. + * + * This is correct when pages are half-dead, since internal pages are + * never half-dead, and leaf pages must have a high key when half-dead + * (the rightmost page can never be deleted). It's also correct with + * fully deleted pages: _bt_unlink_halfdead_page() doesn't change anything + * about the target page other than setting the page as fully dead, and + * setting its xact field. In particular, it doesn't change the sibling + * links in the deletion target itself, since they're required when index + * scans land on the deletion target, and then need to move right (or need + * to move left, in the case of backward index scans). + */ + maxoffset = PageGetMaxOffsetNumber(page); + if (maxoffset > MaxIndexTuplesPerPage) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("Number of items on block %u of index \"%s\" exceeds MaxIndexTuplesPerPage (%u)", + blocknum, RelationGetRelationName(state->rel), + MaxIndexTuplesPerPage))); + + if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && maxoffset < P_FIRSTDATAKEY(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("internal block %u in index \"%s\" lacks high key and/or at least one downlink", + blocknum, RelationGetRelationName(state->rel)))); + + if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && !P_RIGHTMOST(opaque) && maxoffset < P_HIKEY) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("non-rightmost leaf block %u in index \"%s\" lacks high key item", + blocknum, RelationGetRelationName(state->rel)))); + + /* + * In general, internal pages are never marked half-dead, except on + * versions of Postgres prior to 9.4, where it can be valid transient + * state. This state is nonetheless treated as corruption by VACUUM on + * from version 9.4 on, so do the same here. See _bt_pagedel() for full + * details. + */ + if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("internal page block %u in index \"%s\" is half-dead", + blocknum, RelationGetRelationName(state->rel)), + errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it."))); + + /* + * Check that internal pages have no garbage items, and that no page has + * an invalid combination of deletion-related page level flags + */ + if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("internal page block %u in index \"%s\" has garbage items", + blocknum, RelationGetRelationName(state->rel)))); + + if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"", + blocknum, RelationGetRelationName(state->rel)))); + + if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("deleted page block %u in index \"%s\" is half-dead", + blocknum, RelationGetRelationName(state->rel)))); + + return page; +} + +/* + * _bt_mkscankey() wrapper that automatically prevents insertion scankey from + * being considered greater than the pivot tuple that its values originated + * from (or some other identical pivot tuple) in the common case where there + * are truncated/minus infinity attributes. Without this extra step, there + * are forms of corruption that amcheck could theoretically fail to report. + * + * For example, invariant_g_offset() might miss a cross-page invariant failure + * on an internal level if the scankey built from the first item on the + * target's right sibling page happened to be equal to (not greater than) the + * last item on target page. The !pivotsearch tiebreaker in _bt_compare() + * might otherwise cause amcheck to assume (rather than actually verify) that + * the scankey is greater. + */ +static inline BTScanInsert +bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup) +{ + BTScanInsert skey; + + skey = _bt_mkscankey(rel, itup); + skey->pivotsearch = true; + + return skey; +} + +/* + * PageGetItemId() wrapper that validates returned line pointer. + * + * Buffer page/page item access macros generally trust that line pointers are + * not corrupt, which might cause problems for verification itself. For + * example, there is no bounds checking in PageGetItem(). Passing it a + * corrupt line pointer can cause it to return a tuple/pointer that is unsafe + * to dereference. + * + * Validating line pointers before tuples avoids undefined behavior and + * assertion failures with corrupt indexes, making the verification process + * more robust and predictable. + */ +static ItemId +PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, + OffsetNumber offset) +{ + ItemId itemid = PageGetItemId(page, offset); + + if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) > + BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("line pointer points past end of tuple space in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)))); + + /* + * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED, since nbtree + * never uses either. Verify that line pointer has storage, too, since + * even LP_DEAD items should within nbtree. + */ + if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) || + ItemIdGetLength(itemid) == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid line pointer storage in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)))); + + return itemid; +} + +/* + * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in + * cases where that is mandatory (i.e. for non-pivot tuples) + */ +static inline ItemPointer +BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, + bool nonpivot) +{ + ItemPointer htid; + + /* + * Caller determines whether this is supposed to be a pivot or non-pivot + * tuple using page type and item offset number. Verify that tuple + * metadata agrees with this. + */ + Assert(state->heapkeyspace); + if (BTreeTupleIsPivot(itup) && nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)))); + + if (!BTreeTupleIsPivot(itup) && !nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)))); + + htid = BTreeTupleGetHeapTID(itup); + if (!ItemPointerIsValid(htid) && nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", + state->targetblock, + RelationGetRelationName(state->rel)))); + + return htid; +} + +/* + * Return the "pointed to" TID for itup, which is used to generate a + * descriptive error message. itup must be a "data item" tuple (it wouldn't + * make much sense to call here with a high key tuple, since there won't be a + * valid downlink/block number to display). + * + * Returns either a heap TID (which will be the first heap TID in posting list + * if itup is posting list tuple), or a TID that contains downlink block + * number, plus some encoded metadata (e.g., the number of attributes present + * in itup). + */ +static inline ItemPointer +BTreeTupleGetPointsToTID(IndexTuple itup) +{ + /* + * Rely on the assumption that !heapkeyspace internal page data items will + * correctly return TID with downlink here -- BTreeTupleGetHeapTID() won't + * recognize it as a pivot tuple, but everything still works out because + * the t_tid field is still returned + */ + if (!BTreeTupleIsPivot(itup)) + return BTreeTupleGetHeapTID(itup); + + /* Pivot tuple returns TID with downlink block (heapkeyspace variant) */ + return &itup->t_tid; +} |